From 0b84da092ded1045c449e4fd131f0d03155644d4 Mon Sep 17 00:00:00 2001 From: john-rocky Date: Sat, 28 Mar 2026 22:52:31 +0900 Subject: [PATCH 01/18] Add 20 new CoreML models with sample apps New model categories: - Face Manipulation: LivePortrait, FOMM, Wav2Lip, SimSwap, 3DDFA_V2, DPR - Image Harmonization: CDTNet - Audio Source Separation: HTDemucs - Video Motion Magnification: STB-VMM - Image Deblurring: NAFNet - Image Classifiers: MobileNetV3, ConvNeXt, FastViT, MobileOne, etc. - Semantic Segmentation: DeepLabV3, LRASPP Includes 20 SwiftUI sample apps (creative_apps/ and sample_apps/). Model files (.mlpackage) are excluded - download from Google Drive. --- .gitignore | 15 + README.md | 208 +++++- .../CDTNetDemo.xcodeproj/project.pbxproj | 274 +++++++ .../AccentColor.colorset/Contents.json | 11 + .../CDTNetDemo/Assets.xcassets/Contents.json | 6 + .../CDTNetDemo/CDTNetDemo/CDTNetDemoApp.swift | 10 + .../CDTNetDemo/CDTNetDemo/ContentView.swift | 466 ++++++++++++ .../CDTNetDemo/CDTNetDemo/Info.plist | 8 + .../DemucsDemo.xcodeproj/project.pbxproj | 274 +++++++ .../AccentColor.colorset/Contents.json | 11 + .../DemucsDemo/Assets.xcassets/Contents.json | 6 + .../DemucsDemo/DemucsDemo/ContentView.swift | 466 ++++++++++++ .../DemucsDemo/DemucsDemo/DemucsDemoApp.swift | 10 + .../DemucsDemo/DemucsDemo/Info.plist | 8 + .../FOMMDemo.xcodeproj/project.pbxproj | 278 +++++++ .../AccentColor.colorset/Contents.json | 11 + .../FOMMDemo/Assets.xcassets/Contents.json | 6 + .../FOMMDemo/FOMMDemo/ContentView.swift | 621 ++++++++++++++++ .../FOMMDemo/FOMMDemo/FOMMDemoApp.swift | 10 + creative_apps/FOMMDemo/FOMMDemo/Info.plist | 10 + .../Face3DDemo.xcodeproj/project.pbxproj | 272 +++++++ .../AccentColor.colorset/Contents.json | 11 + .../Face3DDemo/Assets.xcassets/Contents.json | 6 + .../Face3DDemo/Face3DDemo/ContentView.swift | 643 ++++++++++++++++ .../Face3DDemo/Face3DDemo/Face3DDemoApp.swift | 10 + .../Face3DDemo/Face3DDemo/Info.plist | 10 + .../project.pbxproj | 286 +++++++ .../AccentColor.colorset/Contents.json | 11 + .../Assets.xcassets/Contents.json | 6 + .../LivePortraitDemo/ContentView.swift | 519 +++++++++++++ .../LivePortraitDemo/Info.plist | 10 + .../LivePortraitDemoApp.swift | 10 + .../MotionMagDemo.xcodeproj/project.pbxproj | 272 +++++++ .../AccentColor.colorset/Contents.json | 11 + .../Assets.xcassets/Contents.json | 6 + .../MotionMagDemo/ContentView.swift | 474 ++++++++++++ .../MotionMagDemo/MotionMagDemo/Info.plist | 10 + .../MotionMagDemo/MotionMagDemoApp.swift | 10 + .../NAFNetDemo.xcodeproj/project.pbxproj | 272 +++++++ .../AccentColor.colorset/Contents.json | 11 + .../NAFNetDemo/Assets.xcassets/Contents.json | 6 + .../NAFNetDemo/NAFNetDemo/ContentView.swift | 462 ++++++++++++ .../NAFNetDemo/NAFNetDemo/Info.plist | 8 + .../NAFNetDemo/NAFNetDemo/NAFNetDemoApp.swift | 10 + .../RelightDemo.xcodeproj/project.pbxproj | 272 +++++++ .../AccentColor.colorset/Contents.json | 11 + .../RelightDemo/Assets.xcassets/Contents.json | 6 + .../RelightDemo/RelightDemo/ContentView.swift | 697 ++++++++++++++++++ .../RelightDemo/RelightDemo/Info.plist | 8 + .../RelightDemo/RelightDemoApp.swift | 10 + .../SimSwapDemo.xcodeproj/project.pbxproj | 276 +++++++ .../AccentColor.colorset/Contents.json | 11 + .../SimSwapDemo/Assets.xcassets/Contents.json | 6 + .../SimSwapDemo/SimSwapDemo/ContentView.swift | 689 +++++++++++++++++ .../SimSwapDemo/SimSwapDemo/Info.plist | 8 + .../SimSwapDemo/SimSwapDemoApp.swift | 10 + .../Wav2LipDemo.xcodeproj/project.pbxproj | 274 +++++++ .../AccentColor.colorset/Contents.json | 11 + 
.../Wav2LipDemo/Assets.xcassets/Contents.json | 6 + .../Wav2LipDemo/Wav2LipDemo/ContentView.swift | 648 ++++++++++++++++ .../Wav2LipDemo/Wav2LipDemo/Info.plist | 12 + .../Wav2LipDemo/Wav2LipDemoApp.swift | 10 + .../project.pbxproj | 342 +++++++++ .../AccentColor.colorset/Contents.json | 11 + .../Assets.xcassets/Contents.json | 6 + .../ConvNeXtTinyDemo/ContentView.swift | 225 ++++++ .../ConvNeXtTinyDemoApp.swift | 10 + .../ConvNeXtTinyDemo/ImageNetLabels.swift | 95 +++ .../ConvNeXtTinyDemo/Info.plist | 6 + .../DeepLabV3Demo.xcodeproj/project.pbxproj | 344 +++++++++ .../AccentColor.colorset/Contents.json | 11 + .../Assets.xcassets/Contents.json | 6 + .../DeepLabV3Demo/ContentView.swift | 371 ++++++++++ .../DeepLabV3Demo/DeepLabV3DemoApp.swift | 10 + .../DeepLabV3Demo/ImageNetLabels.swift | 95 +++ .../DeepLabV3Demo/DeepLabV3Demo/Info.plist | 8 + .../project.pbxproj | 342 +++++++++ .../AccentColor.colorset/Contents.json | 11 + .../AppIcon.appiconset/Contents.json | 13 + .../Assets.xcassets/Contents.json | 6 + .../EfficientFormerV2Demo/ContentView.swift | 306 ++++++++ .../EfficientFormerV2DemoApp.swift | 10 + .../ImageNetLabels.swift | 95 +++ .../EfficientFormerV2Demo/Info.plist | 8 + .../FastViTDemo.xcodeproj/project.pbxproj | 344 +++++++++ .../AccentColor.colorset/Contents.json | 11 + .../FastViTDemo/Assets.xcassets/Contents.json | 6 + .../FastViTDemo/FastViTDemo/ContentView.swift | 454 ++++++++++++ .../FastViTDemo/FastViTDemoApp.swift | 10 + .../FastViTDemo/ImageNetLabels.swift | 95 +++ .../FastViTDemo/FastViTDemo/Info.plist | 8 + .../GhostNetV2Demo.xcodeproj/project.pbxproj | 342 +++++++++ .../AccentColor.colorset/Contents.json | 11 + .../AppIcon.appiconset/Contents.json | 13 + .../Assets.xcassets/Contents.json | 6 + .../GhostNetV2Demo/ContentView.swift | 343 +++++++++ .../GhostNetV2Demo/GhostNetV2DemoApp.swift | 10 + .../GhostNetV2Demo/ImageNetLabels.swift | 95 +++ .../GhostNetV2Demo/GhostNetV2Demo/Info.plist | 8 + .../LRASPPDemo.xcodeproj/project.pbxproj | 342 +++++++++ .../AccentColor.colorset/Contents.json | 11 + .../AppIcon.appiconset/Contents.json | 13 + .../LRASPPDemo/Assets.xcassets/Contents.json | 6 + .../LRASPPDemo/LRASPPDemo/ContentView.swift | 317 ++++++++ sample_apps/LRASPPDemo/LRASPPDemo/Info.plist | 8 + .../LRASPPDemo/LRASPPDemo/LRASPPDemoApp.swift | 10 + .../LRASPPDemo/LRASPPDemo/VOCLabels.swift | 56 ++ .../LeViTDemo.xcodeproj/project.pbxproj | 344 +++++++++ .../AccentColor.colorset/Contents.json | 11 + .../AppIcon.appiconset/Contents.json | 13 + .../LeViTDemo/Assets.xcassets/Contents.json | 6 + .../LeViTDemo/LeViTDemo/ContentView.swift | 408 ++++++++++ .../LeViTDemo/LeViTDemo/ImageNetLabels.swift | 95 +++ sample_apps/LeViTDemo/LeViTDemo/Info.plist | 10 + .../LeViTDemo/LeViTDemo/LeViTDemoApp.swift | 10 + .../project.pbxproj | 344 +++++++++ .../AccentColor.colorset/Contents.json | 11 + .../Assets.xcassets/Contents.json | 6 + .../MobileNetV3SmallDemo/ContentView.swift | 241 ++++++ .../MobileNetV3SmallDemo/ImageNetLabels.swift | 95 +++ .../MobileNetV3SmallDemo/Info.plist | 8 + .../MobileNetV3SmallDemoApp.swift | 10 + .../MobileOneDemo.xcodeproj/project.pbxproj | 344 +++++++++ .../AccentColor.colorset/Contents.json | 11 + .../Assets.xcassets/Contents.json | 6 + .../MobileOneDemo/ContentView.swift | 379 ++++++++++ .../MobileOneDemo/ImageNetLabels.swift | 95 +++ .../MobileOneDemo/MobileOneDemo/Info.plist | 8 + .../MobileOneDemo/MobileOneDemoApp.swift | 10 + .../PoolFormerDemo.xcodeproj/project.pbxproj | 342 +++++++++ .../AccentColor.colorset/Contents.json | 11 + 
.../AppIcon.appiconset/Contents.json | 13 + .../Assets.xcassets/Contents.json | 6 + .../PoolFormerDemo/ContentView.swift | 377 ++++++++++ .../PoolFormerDemo/ImageNetLabels.swift | 95 +++ .../PoolFormerDemo/PoolFormerDemo/Info.plist | 8 + .../PoolFormerDemo/PoolFormerDemoApp.swift | 10 + 137 files changed, 17196 insertions(+), 1 deletion(-) create mode 100644 .gitignore create mode 100644 creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/CDTNetDemo/CDTNetDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/CDTNetDemo/CDTNetDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/CDTNetDemo/CDTNetDemo/CDTNetDemoApp.swift create mode 100644 creative_apps/CDTNetDemo/CDTNetDemo/ContentView.swift create mode 100644 creative_apps/CDTNetDemo/CDTNetDemo/Info.plist create mode 100644 creative_apps/DemucsDemo/DemucsDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/DemucsDemo/DemucsDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/DemucsDemo/DemucsDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/DemucsDemo/DemucsDemo/ContentView.swift create mode 100644 creative_apps/DemucsDemo/DemucsDemo/DemucsDemoApp.swift create mode 100644 creative_apps/DemucsDemo/DemucsDemo/Info.plist create mode 100644 creative_apps/FOMMDemo/FOMMDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/FOMMDemo/FOMMDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/FOMMDemo/FOMMDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/FOMMDemo/FOMMDemo/ContentView.swift create mode 100644 creative_apps/FOMMDemo/FOMMDemo/FOMMDemoApp.swift create mode 100644 creative_apps/FOMMDemo/FOMMDemo/Info.plist create mode 100644 creative_apps/Face3DDemo/Face3DDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/Face3DDemo/Face3DDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/Face3DDemo/Face3DDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/Face3DDemo/Face3DDemo/ContentView.swift create mode 100644 creative_apps/Face3DDemo/Face3DDemo/Face3DDemoApp.swift create mode 100644 creative_apps/Face3DDemo/Face3DDemo/Info.plist create mode 100644 creative_apps/LivePortraitDemo/LivePortraitDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/LivePortraitDemo/LivePortraitDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/LivePortraitDemo/LivePortraitDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/LivePortraitDemo/LivePortraitDemo/ContentView.swift create mode 100644 creative_apps/LivePortraitDemo/LivePortraitDemo/Info.plist create mode 100644 creative_apps/LivePortraitDemo/LivePortraitDemo/LivePortraitDemoApp.swift create mode 100644 creative_apps/MotionMagDemo/MotionMagDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/MotionMagDemo/MotionMagDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/MotionMagDemo/MotionMagDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/MotionMagDemo/MotionMagDemo/ContentView.swift create mode 100644 creative_apps/MotionMagDemo/MotionMagDemo/Info.plist create mode 100644 creative_apps/MotionMagDemo/MotionMagDemo/MotionMagDemoApp.swift create mode 100644 creative_apps/NAFNetDemo/NAFNetDemo.xcodeproj/project.pbxproj create mode 100644 
creative_apps/NAFNetDemo/NAFNetDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/NAFNetDemo/NAFNetDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/NAFNetDemo/NAFNetDemo/ContentView.swift create mode 100644 creative_apps/NAFNetDemo/NAFNetDemo/Info.plist create mode 100644 creative_apps/NAFNetDemo/NAFNetDemo/NAFNetDemoApp.swift create mode 100644 creative_apps/RelightDemo/RelightDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/RelightDemo/RelightDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/RelightDemo/RelightDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/RelightDemo/RelightDemo/ContentView.swift create mode 100644 creative_apps/RelightDemo/RelightDemo/Info.plist create mode 100644 creative_apps/RelightDemo/RelightDemo/RelightDemoApp.swift create mode 100644 creative_apps/SimSwapDemo/SimSwapDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/SimSwapDemo/SimSwapDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/SimSwapDemo/SimSwapDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/SimSwapDemo/SimSwapDemo/ContentView.swift create mode 100644 creative_apps/SimSwapDemo/SimSwapDemo/Info.plist create mode 100644 creative_apps/SimSwapDemo/SimSwapDemo/SimSwapDemoApp.swift create mode 100644 creative_apps/Wav2LipDemo/Wav2LipDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/Wav2LipDemo/Wav2LipDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/Wav2LipDemo/Wav2LipDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/Wav2LipDemo/Wav2LipDemo/ContentView.swift create mode 100644 creative_apps/Wav2LipDemo/Wav2LipDemo/Info.plist create mode 100644 creative_apps/Wav2LipDemo/Wav2LipDemo/Wav2LipDemoApp.swift create mode 100644 sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo.xcodeproj/project.pbxproj create mode 100644 sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/Assets.xcassets/Contents.json create mode 100644 sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/ContentView.swift create mode 100644 sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/ConvNeXtTinyDemoApp.swift create mode 100644 sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/ImageNetLabels.swift create mode 100644 sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/Info.plist create mode 100644 sample_apps/DeepLabV3Demo/DeepLabV3Demo.xcodeproj/project.pbxproj create mode 100644 sample_apps/DeepLabV3Demo/DeepLabV3Demo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 sample_apps/DeepLabV3Demo/DeepLabV3Demo/Assets.xcassets/Contents.json create mode 100644 sample_apps/DeepLabV3Demo/DeepLabV3Demo/ContentView.swift create mode 100644 sample_apps/DeepLabV3Demo/DeepLabV3Demo/DeepLabV3DemoApp.swift create mode 100644 sample_apps/DeepLabV3Demo/DeepLabV3Demo/ImageNetLabels.swift create mode 100644 sample_apps/DeepLabV3Demo/DeepLabV3Demo/Info.plist create mode 100644 sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo.xcodeproj/project.pbxproj create mode 100644 sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 
sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Assets.xcassets/Contents.json create mode 100644 sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/ContentView.swift create mode 100644 sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/EfficientFormerV2DemoApp.swift create mode 100644 sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/ImageNetLabels.swift create mode 100644 sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Info.plist create mode 100644 sample_apps/FastViTDemo/FastViTDemo.xcodeproj/project.pbxproj create mode 100644 sample_apps/FastViTDemo/FastViTDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 sample_apps/FastViTDemo/FastViTDemo/Assets.xcassets/Contents.json create mode 100644 sample_apps/FastViTDemo/FastViTDemo/ContentView.swift create mode 100644 sample_apps/FastViTDemo/FastViTDemo/FastViTDemoApp.swift create mode 100644 sample_apps/FastViTDemo/FastViTDemo/ImageNetLabels.swift create mode 100644 sample_apps/FastViTDemo/FastViTDemo/Info.plist create mode 100644 sample_apps/GhostNetV2Demo/GhostNetV2Demo.xcodeproj/project.pbxproj create mode 100644 sample_apps/GhostNetV2Demo/GhostNetV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 sample_apps/GhostNetV2Demo/GhostNetV2Demo/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 sample_apps/GhostNetV2Demo/GhostNetV2Demo/Assets.xcassets/Contents.json create mode 100644 sample_apps/GhostNetV2Demo/GhostNetV2Demo/ContentView.swift create mode 100644 sample_apps/GhostNetV2Demo/GhostNetV2Demo/GhostNetV2DemoApp.swift create mode 100644 sample_apps/GhostNetV2Demo/GhostNetV2Demo/ImageNetLabels.swift create mode 100644 sample_apps/GhostNetV2Demo/GhostNetV2Demo/Info.plist create mode 100644 sample_apps/LRASPPDemo/LRASPPDemo.xcodeproj/project.pbxproj create mode 100644 sample_apps/LRASPPDemo/LRASPPDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 sample_apps/LRASPPDemo/LRASPPDemo/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 sample_apps/LRASPPDemo/LRASPPDemo/Assets.xcassets/Contents.json create mode 100644 sample_apps/LRASPPDemo/LRASPPDemo/ContentView.swift create mode 100644 sample_apps/LRASPPDemo/LRASPPDemo/Info.plist create mode 100644 sample_apps/LRASPPDemo/LRASPPDemo/LRASPPDemoApp.swift create mode 100644 sample_apps/LRASPPDemo/LRASPPDemo/VOCLabels.swift create mode 100644 sample_apps/LeViTDemo/LeViTDemo.xcodeproj/project.pbxproj create mode 100644 sample_apps/LeViTDemo/LeViTDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 sample_apps/LeViTDemo/LeViTDemo/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 sample_apps/LeViTDemo/LeViTDemo/Assets.xcassets/Contents.json create mode 100644 sample_apps/LeViTDemo/LeViTDemo/ContentView.swift create mode 100644 sample_apps/LeViTDemo/LeViTDemo/ImageNetLabels.swift create mode 100644 sample_apps/LeViTDemo/LeViTDemo/Info.plist create mode 100644 sample_apps/LeViTDemo/LeViTDemo/LeViTDemoApp.swift create mode 100644 sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo.xcodeproj/project.pbxproj create mode 100644 sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/Assets.xcassets/Contents.json create mode 100644 sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/ContentView.swift create mode 100644 sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/ImageNetLabels.swift create mode 
100644 sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/Info.plist create mode 100644 sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/MobileNetV3SmallDemoApp.swift create mode 100644 sample_apps/MobileOneDemo/MobileOneDemo.xcodeproj/project.pbxproj create mode 100644 sample_apps/MobileOneDemo/MobileOneDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 sample_apps/MobileOneDemo/MobileOneDemo/Assets.xcassets/Contents.json create mode 100644 sample_apps/MobileOneDemo/MobileOneDemo/ContentView.swift create mode 100644 sample_apps/MobileOneDemo/MobileOneDemo/ImageNetLabels.swift create mode 100644 sample_apps/MobileOneDemo/MobileOneDemo/Info.plist create mode 100644 sample_apps/MobileOneDemo/MobileOneDemo/MobileOneDemoApp.swift create mode 100644 sample_apps/PoolFormerDemo/PoolFormerDemo.xcodeproj/project.pbxproj create mode 100644 sample_apps/PoolFormerDemo/PoolFormerDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 sample_apps/PoolFormerDemo/PoolFormerDemo/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 sample_apps/PoolFormerDemo/PoolFormerDemo/Assets.xcassets/Contents.json create mode 100644 sample_apps/PoolFormerDemo/PoolFormerDemo/ContentView.swift create mode 100644 sample_apps/PoolFormerDemo/PoolFormerDemo/ImageNetLabels.swift create mode 100644 sample_apps/PoolFormerDemo/PoolFormerDemo/Info.plist create mode 100644 sample_apps/PoolFormerDemo/PoolFormerDemo/PoolFormerDemoApp.swift diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4e76ca2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,15 @@ +# CoreML model files (download from Google Drive) +*.mlpackage +*.mlmodel +*.mlmodelc +*.mlpackage/ + +# Converted models directory +converted_models/ +creative_models/ + +# Python conversion scripts +convert_all.py +convert_remaining.py +__pycache__/ +*.pyc diff --git a/README.md b/README.md index c181b8c..1f542f3 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,14 @@ You are free to do or not. - [RepVGG](#repvgg) - [RegNet](#regnet) - [MobileViTv2](#mobilevitv2) + - [MobileNetV3-Small](#mobilenetv3-small) + - [ConvNeXt-Tiny](#convnext-tiny) + - [FastViT-T8](#fastvit-t8) + - [MobileOne-S0](#mobileone-s0) + - [EfficientFormerV2-S0](#efficientformerv2-s0) + - [GhostNetV2-100](#ghostnetv2-100) + - [PoolFormer-S12](#poolformer-s12) + - [LeViT-128S](#levit-128s) - [**Object Detection**](#object-detection) @@ -48,6 +56,8 @@ You are free to do or not. - [Semantic FPN](#semantic-fpn) - [cloths_segmentation](#cloths_segmentation) - [easyportrait](#easyportrait) + - [DeepLabV3-MobileNetV3](#deeplabv3-mobilenetv3) + - [LRASPP-MobileNetV3](#lraspp-mobilenetv3) - [**Super Resolution**](#super-resolution) - [Real ESRGAN](#real-esrgan) @@ -106,6 +116,26 @@ You are free to do or not. 
- [Openjourney](#openjourney) - [dreamlike-photoreal-2.0](#dreamlike-photoreal-2) +- [**Face Manipulation**](#face-manipulation) **:NEW** + - [LivePortrait](#liveportrait) + - [FOMM](#fomm) + - [Wav2Lip](#wav2lip) + - [SimSwap](#simswap) + - [3DDFA_V2](#3ddfa_v2) + - [DPR Portrait Relighting](#dpr-portrait-relighting) + +- [**Image Harmonization**](#image-harmonization) **:NEW** + - [CDTNet](#cdtnet) + +- [**Audio Source Separation**](#audio-source-separation) **:NEW** + - [HTDemucs](#htdemucs) + +- [**Video Motion Magnification**](#video-motion-magnification) **:NEW** + - [STB-VMM](#stb-vmm) + +- [**Image Deblurring**](#image-deblurring) **:NEW** + - [NAFNet](#nafnet) + # How to get the model You can get the model converted to CoreML format from the link of Google drive. See the section below for how to use it in Xcode. @@ -191,6 +221,70 @@ CVNets: A library for training computer vision networks | ------------- | ------------- | ------------- |------------- |------------- |------------- |------------- | | [MobileViTv2](https://drive.google.com/file/d/1__aG67p6o5-NIchkHpfFJBszCpIhI0uf/view?usp=share_link) | 18.8 MB | ImageNet | [apple/ml-cvnets](https://github.com/apple/ml-cvnets) | [apple](https://github.com/apple/ml-cvnets/blob/main/LICENSE)|2022|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)]([https://colab.research.google.com/drive/1QiTlFsN948Xt2e4WgqUB8DnGgwWwtVZS?usp=sharing](https://colab.research.google.com/drive/1UQwhFpVP_4Q9I6LXPdBSS0VDhIRdUBQA?usp=sharing)) | +### MobileNetV3-Small + +Lightweight classification model optimized for mobile devices. Ultra-fast inference with 67.7% top-1 accuracy. + +| Google Drive Link | Size | Dataset |Original Project | License |Year| Sample Project | +| ------------- | ------------- | ------------- |------------- |------------- |------------- |------------- | +| MobileNetV3-Small (TBD) | 4.9 MB | ImageNet | [pytorch/vision](https://github.com/pytorch/vision) | [BSD-3](https://github.com/pytorch/vision/blob/main/LICENSE)|2019| [MobileNetV3SmallDemo](sample_apps/MobileNetV3SmallDemo) | + +### ConvNeXt-Tiny + +A ConvNet for the 2020s. Pure CNN architecture that competes with Vision Transformers. 82.5% top-1 accuracy. + +| Google Drive Link | Size | Dataset |Original Project | License |Year| Sample Project | +| ------------- | ------------- | ------------- |------------- |------------- |------------- |------------- | +| ConvNeXt-Tiny (TBD) | 54.6 MB | ImageNet | [facebookresearch/ConvNeXt](https://github.com/facebookresearch/ConvNeXt) | [MIT](https://github.com/facebookresearch/ConvNeXt/blob/main/LICENSE)|2022| [ConvNeXtTinyDemo](sample_apps/ConvNeXtTinyDemo) | + +### FastViT-T8 + +Apple's hybrid vision transformer. Ultra-fast inference with structural reparameterization. 76.2% top-1 accuracy. + +| Google Drive Link | Size | Dataset |Original Project | License |Year| Sample Project | +| ------------- | ------------- | ------------- |------------- |------------- |------------- |------------- | +| FastViT-T8 (TBD) | 7.8 MB | ImageNet | [apple/ml-fastvit](https://github.com/apple/ml-fastvit) | [Apple](https://github.com/apple/ml-fastvit/blob/main/LICENSE)|2023| [FastViTDemo](sample_apps/FastViTDemo) | + +### MobileOne-S0 + +Apple's sub-millisecond mobile backbone. Optimized for on-device inference with reparameterizable architecture. 71.4% top-1 accuracy. 
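+
+The classifiers in this section all share the same 224x224 ImageNet interface, so a single Vision snippet covers them. A minimal sketch -- the `MobileOneS0` class name is hypothetical (Xcode generates a class named after whichever .mlpackage you add), and a model exported with a raw logit output returns `VNCoreMLFeatureValueObservation` instead of classifications:
+
+```swift
+import CoreML
+import UIKit
+import Vision
+
+/// Classify a UIImage with any of the bundled ImageNet classifiers.
+/// `MobileOneS0` is a placeholder for the Xcode-generated model class.
+func classify(_ image: UIImage, completion: @escaping (String) -> Void) {
+    guard let cgImage = image.cgImage,
+          let coreMLModel = try? MobileOneS0(configuration: MLModelConfiguration()).model,
+          let visionModel = try? VNCoreMLModel(for: coreMLModel) else { return }
+
+    let request = VNCoreMLRequest(model: visionModel) { request, _ in
+        guard let top = (request.results as? [VNClassificationObservation])?.first else { return }
+        completion("\(top.identifier): \(top.confidence)")
+    }
+    request.imageCropAndScaleOption = .centerCrop // match ImageNet eval preprocessing
+
+    try? VNImageRequestHandler(cgImage: cgImage).perform([request])
+}
+```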
+ +| Google Drive Link | Size | Dataset |Original Project | License |Year| Sample Project | +| ------------- | ------------- | ------------- |------------- |------------- |------------- |------------- | +| MobileOne-S0 (TBD) | 10.4 MB | ImageNet | [apple/ml-mobileone](https://github.com/apple/ml-mobileone) | [Apple](https://github.com/apple/ml-mobileone/blob/main/LICENSE)|2022| [MobileOneDemo](sample_apps/MobileOneDemo) | + +### EfficientFormerV2-S0 + +Rethinking Vision Transformers for MobileNet Size and Speed. Lightweight ViT for mobile. 76.2% top-1 accuracy. + +| Google Drive Link | Size | Dataset |Original Project | License |Year| Sample Project | +| ------------- | ------------- | ------------- |------------- |------------- |------------- |------------- | +| EfficientFormerV2-S0 (TBD) | 7.2 MB | ImageNet | [snap-research/EfficientFormer](https://github.com/snap-research/EfficientFormer) | [Apache2.0](https://github.com/snap-research/EfficientFormer/blob/main/LICENSE)|2023| [EfficientFormerV2Demo](sample_apps/EfficientFormerV2Demo) | + +### GhostNetV2-100 + +GhostNetV2: Enhance Cheap Operation with Long-Range Attention. Ghost module with DFC attention. 75.3% top-1 accuracy. + +| Google Drive Link | Size | Dataset |Original Project | License |Year| Sample Project | +| ------------- | ------------- | ------------- |------------- |------------- |------------- |------------- | +| GhostNetV2-100 (TBD) | 11.9 MB | ImageNet | [huawei-noah/Efficient-AI-Backbones](https://github.com/huawei-noah/Efficient-AI-Backbones) | [Apache2.0](https://github.com/huawei-noah/Efficient-AI-Backbones/blob/master/LICENSE)|2022| [GhostNetV2Demo](sample_apps/GhostNetV2Demo) | + +### PoolFormer-S12 + +MetaFormer is Actually What You Need for Vision. Uses simple pooling instead of attention. 77.2% top-1 accuracy. + +| Google Drive Link | Size | Dataset |Original Project | License |Year| Sample Project | +| ------------- | ------------- | ------------- |------------- |------------- |------------- |------------- | +| PoolFormer-S12 (TBD) | 22.9 MB | ImageNet | [sail-sg/poolformer](https://github.com/sail-sg/poolformer) | [Apache2.0](https://github.com/sail-sg/poolformer/blob/main/LICENSE)|2022| [PoolFormerDemo](sample_apps/PoolFormerDemo) | + +### LeViT-128S + +LeViT: A Vision Transformer in ConvNet's Clothing. Fast hybrid CNN-Transformer. 76.6% top-1 accuracy. + +| Google Drive Link | Size | Dataset |Original Project | License |Year| Sample Project | +| ------------- | ------------- | ------------- |------------- |------------- |------------- |------------- | +| LeViT-128S (TBD) | 16.0 MB | ImageNet | [facebookresearch/LeViT](https://github.com/facebookresearch/LeViT) | [Apache2.0](https://github.com/facebookresearch/LeViT/blob/main/LICENSE)|2021| [LeViTDemo](sample_apps/LeViTDemo) | + # Object Detection ### YOLOv5s @@ -354,6 +448,22 @@ EasyPortrait - Face Parsing and Portrait Segmentation Dataset. 
| ------------- | ------------- | ------------- |------------- | ------------- | ------------- |------------- |------------- | | [easyportrait-segformer512-fp](https://drive.google.com/drive/folders/13BUhNpQHodAgcj6eJaPbzuSUaFn3JuU-?usp=sharing) | 7.6 MB | Image(GrayScale 512x512) * 9 |[hukenovs/easyportrait](https://github.com/hukenovs/easyportrait) | [Creative Commons](https://github.com/hukenovs/easyportrait/tree/main/license) |2023|[easyportrait-coreml](https://github.com/john-rocky/easyportrait-coreml)|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/11a3XWFA8fa8V0a2zgWFqOMUaZgF4O1qt?usp=sharing)| +### DeepLabV3-MobileNetV3 + +DeepLabV3 with MobileNetV3-Large backbone. 21-class PASCAL VOC semantic segmentation (person, car, cat, dog, etc.). + +| Google Drive Link | Size | Output |Original Project | License | Year | Sample Project | +| ------------- | ------------- | ------------- |------------- | ------------- | ------------- |------------- | +| DeepLabV3-MobileNetV3 (TBD) | 21.1 MB | MultiArray (1x21x512x512) | [pytorch/vision](https://github.com/pytorch/vision) | [BSD-3](https://github.com/pytorch/vision/blob/main/LICENSE) |2019| [DeepLabV3Demo](sample_apps/DeepLabV3Demo) | + +### LRASPP-MobileNetV3 + +Lite R-ASPP with MobileNetV3-Large backbone. Ultra-lightweight 21-class semantic segmentation (57.9 mIoU). Only 6.3 MB. + +| Google Drive Link | Size | Output |Original Project | License | Year | Sample Project | +| ------------- | ------------- | ------------- |------------- | ------------- | ------------- |------------- | +| LRASPP-MobileNetV3 (TBD) | 6.3 MB | MultiArray (1x21x512x512) | [pytorch/vision](https://github.com/pytorch/vision) | [BSD-3](https://github.com/pytorch/vision/blob/main/LICENSE) |2019| [LRASPPDemo](sample_apps/LRASPPDemo) | + # Super Resolution ### [Real ESRGAN](https://drive.google.com/file/d/1cpm-x12Ih7Cqd_kOjfTvtt4ipGS3BpCx/view?usp=sharing) @@ -850,10 +960,106 @@ model_fp16 = quantization_utils.quantize_weights(model_fp32, nbits=16) +# Face Manipulation + +### LivePortrait + +Portrait Animation (Kuaishou, 2024). Animate any portrait photo with expression transfer from a driving video. Multi-model pipeline. + +| Model | Size | Input | Output | Original Project | License | Year | Sample Project | +| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- | +| [LivePortrait_MotionExtractor (TBD)] | 54 MB | 256x256 image | keypoints, pose, expression | [KwaiVGI/LivePortrait](https://github.com/KwaiVGI/LivePortrait) | MIT | 2024 | [LivePortraitDemo](creative_apps/LivePortraitDemo) | +| [LivePortrait_AppearanceExtractor (TBD)] | 1.6 MB | 256x256 image | 3D feature volume | | | | | +| [LivePortrait_WarpingNetwork (TBD)] | 91 MB | features + keypoints | warped features | | | | | +| [LivePortrait_SPADEGenerator (TBD)] | 106 MB | warped features | 512x512 output | | | | | + +### FOMM + +First Order Motion Model. Face reenactment -- transfer facial expressions and head pose from one person to another. 
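+
+A minimal sketch of driving the two converted models listed in the table below. Every feature name here ("image", "keypoints", "source_image", "kp_source", "kp_driving", "output_image") is an assumption -- check the real names in Xcode's model inspector before use:
+
+```swift
+import CoreML
+
+/// Hypothetical FOMM pipeline: detect keypoints on both frames, then warp the source.
+func reenact(source: MLMultiArray, drivingFrame: MLMultiArray,
+             kpDetector: MLModel, generator: MLModel) throws -> MLMultiArray? {
+    // 1. 10 keypoints (+ Jacobians) for the source and the driving frame.
+    let kpSource = try kpDetector.prediction(from: MLDictionaryFeatureProvider(
+        dictionary: ["image": MLFeatureValue(multiArray: source)]))
+    let kpDriving = try kpDetector.prediction(from: MLDictionaryFeatureProvider(
+        dictionary: ["image": MLFeatureValue(multiArray: drivingFrame)]))
+
+    guard let srcKP = kpSource.featureValue(for: "keypoints"),
+          let drvKP = kpDriving.featureValue(for: "keypoints") else { return nil }
+
+    // 2. The generator warps the source toward the driving pose/expression.
+    let output = try generator.prediction(from: MLDictionaryFeatureProvider(dictionary: [
+        "source_image": MLFeatureValue(multiArray: source),
+        "kp_source": srcKP,
+        "kp_driving": drvKP
+    ]))
+    return output.featureValue(for: "output_image")?.multiArrayValue
+}
+```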
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [FOMM_KPDetector (TBD)] | 27 MB | 256x256 image | 10 keypoints + Jacobians | [AliaksandrSiarohin/first-order-model](https://github.com/AliaksandrSiarohin/first-order-model) | MIT | 2019 | [FOMMDemo](creative_apps/FOMMDemo) |
+| [FOMM_Generator (TBD)] | 87 MB | source + keypoint pairs | 256x256 output | | | | |
+
+### Wav2Lip
+
+Audio-Driven Talking Head. Make any portrait speak from audio input.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [Wav2Lip (TBD)] | 69 MB | face(6ch,96x96) + mel(1,1,80,16) | lip-synced face(96x96) | [Rudrabha/Wav2Lip](https://github.com/Rudrabha/Wav2Lip) | See repo | 2020 | [Wav2LipDemo](creative_apps/Wav2LipDemo) |
+
+### SimSwap
+
+Face Swap. Transfer face identity between photos using ArcFace embeddings + generator.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [SimSwap_ArcFace (TBD)] | 100 MB | 112x112 face | 512-d identity embedding | [neuralchen/SimSwap](https://github.com/neuralchen/SimSwap) | See repo | 2020 | [SimSwapDemo](creative_apps/SimSwapDemo) |
+| [SimSwap_Generator (TBD)] | 105 MB | 224x224 target + 512-d id | 224x224 swapped face | | | | |
+
+### 3DDFA_V2
+
+3D Dense Face Alignment. Reconstruct a 3D face mesh from a single photo using a MobileNet backbone (only 6.3 MB).
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [3DDFA_V2 (TBD)] | 6.3 MB | 120x120 face | 62 3DMM params (pose+shape+expression) | [cleardusk/3DDFA_V2](https://github.com/cleardusk/3DDFA_V2) | MIT | 2020 | [Face3DDemo](creative_apps/Face3DDemo) |
+
+### DPR Portrait Relighting
+
+Deep Portrait Relighting. Change lighting direction in portraits using Spherical Harmonics.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [DPR_Relighting (TBD)] | 1.4 MB | 512x512 luminance + 9 SH coefficients | relit portrait | [zhhoper/DPR](https://github.com/zhhoper/DPR) | See repo | 2019 | [RelightDemo](creative_apps/RelightDemo) |
+
+# Image Harmonization
+
+### CDTNet
+
+Collaborative Dual-Transformation Network (from "High-Resolution Image Harmonization via Collaborative Dual Transformations"). Make composited foreground objects blend naturally with the background.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [CDTNet_Harmonization (TBD)] | 5.4 MB | 256x256 composite + mask | harmonized image | [bcmi/CDTNet](https://github.com/bcmi/CDTNet-High-Resolution-Image-Harmonization) | See repo | 2022 | [CDTNetDemo](creative_apps/CDTNetDemo) |
+
+# Audio Source Separation
+
+### HTDemucs
+
+Hybrid Transformer Demucs by Meta. Separate music into 4 stems: vocals, drums, bass, other.
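+
+The model consumes fixed-size windows (343,980 samples at 44.1 kHz, about 7.8 seconds). A minimal sketch for assembling the (1,2,343980) time-domain input from a file -- it assumes the file decodes to AVAudioFile's default non-interleaved Float32 processing format at 44.1 kHz stereo, zero-padding shorter files; see also the STFT note after the table below:
+
+```swift
+import AVFoundation
+import CoreML
+
+/// Load the first chunk of an audio file into HTDemucs' time-domain input tensor.
+func makeTimeInput(url: URL) throws -> MLMultiArray {
+    let chunkLength = 343_980
+    let file = try AVAudioFile(forReading: url)
+    let buffer = AVAudioPCMBuffer(pcmFormat: file.processingFormat,
+                                  frameCapacity: AVAudioFrameCount(chunkLength))!
+    try file.read(into: buffer, frameCount: AVAudioFrameCount(chunkLength))
+
+    let array = try MLMultiArray(shape: [1, 2, 343_980], dataType: .float32)
+    // MLMultiArray contents are not guaranteed to be zeroed; zero-pad explicitly.
+    memset(array.dataPointer, 0, 2 * chunkLength * MemoryLayout<Float>.stride)
+
+    let ptr = array.dataPointer.bindMemory(to: Float.self, capacity: 2 * chunkLength)
+    let frames = Int(buffer.frameLength)
+    let channels = min(2, Int(file.processingFormat.channelCount)) // mono files fill one channel
+    for channel in 0..<channels {
+        guard let samples = buffer.floatChannelData?[channel] else { continue }
+        for i in 0..<frames { ptr[channel * chunkLength + i] = samples[i] }
+    }
+    return array
+}
+```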
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [HTDemucs (TBD)] | 100 MB | STFT freq(1,8,2049,336) + waveform(1,2,343980) | 4 separated stems | [facebookresearch/demucs](https://github.com/facebookresearch/demucs) | MIT | 2023 | [DemucsDemo](creative_apps/DemucsDemo) |
+
+Note: STFT/iSTFT must be performed app-side using Accelerate/vDSP. See the sample app for integration details.
+
+# Video Motion Magnification
+
+### STB-VMM
+
+Swin Transformer Based Video Motion Magnification. Amplify invisible micro-motions in video (e.g., visualize heartbeat or structural vibrations).
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [STB_VMM (TBD)] | 65 MB | 2 frames(384x384) + magnification factor | magnified frame(384x384) | [RLado/STB-VMM](https://github.com/RLado/STB-VMM) | GPL-3.0 | 2023 | [MotionMagDemo](creative_apps/MotionMagDemo) |
+
+# Image Deblurring
+
+### NAFNet
+
+Nonlinear Activation Free Network. State-of-the-art image deblurring without nonlinear activation functions.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [NAFNet_Deblur (TBD)] | 130 MB | 256x256 blurry image | 256x256 deblurred image | [megvii-research/NAFNet](https://github.com/megvii-research/NAFNet) | MIT | 2022 | [NAFNetDemo](creative_apps/NAFNetDemo) |
 
 # Thanks
 
-Cover image was taken from Ghibli free images.
+Cover image was taken from Ghibli free images.
 
 On YOLOv5 conversion, [dbsystel/yolov5-coreml-tools](https://github.com/dbsystel/yolov5-coreml-tools) gave me the super intelligent conversion script.
diff --git a/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.pbxproj b/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..1ff237f
--- /dev/null
+++ b/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,274 @@
+// !$*UTF8*$!
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + A10001 /* CDTNetDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10002; }; + A10003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10004; }; + A10005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = A10006; }; + A1CD02 /* CDTNet_Harmonization.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = A1CD01; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + A10007 /* CDTNetDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = CDTNetDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; + A10002 /* CDTNetDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CDTNetDemoApp.swift; sourceTree = "<group>"; }; + A10004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; + A10006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; + A10008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; + A1CD01 /* CDTNet_Harmonization.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = CDTNet_Harmonization.mlpackage; sourceTree = "<group>"; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + A10009 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + A10010 = { + isa = PBXGroup; + children = ( + A10011 /* CDTNetDemo */, + A10012 /* Products */, + ); + sourceTree = "<group>"; + }; + A10011 /* CDTNetDemo */ = { + isa = PBXGroup; + children = ( + A10002 /* CDTNetDemoApp.swift */, + A10004 /* ContentView.swift */, + A10006 /* Assets.xcassets */, + A10008 /* Info.plist */, + A1CD01 /* CDTNet_Harmonization.mlpackage */, + ); + path = CDTNetDemo; + sourceTree = "<group>"; + }; + A10012 /* Products */ = { + isa = PBXGroup; + children = ( + A10007 /* CDTNetDemo.app */, + ); + name = Products; + sourceTree = "<group>"; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + A10013 /* CDTNetDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = A10014; + buildPhases = ( + A10015 /* Sources */, + A10009 /* Frameworks */, + A10016 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = CDTNetDemo; + productName = CDTNetDemo; + productReference = A10007; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + A10017 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + A10013 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = A10018; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = A10010; + productRefGroup = A10012; + projectDirPath = ""; + projectRoot = ""; + targets = ( + A10013, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + A10016 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 
2147483647; + files = ( + A10005 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + A10015 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + A10001 /* CDTNetDemoApp.swift in Sources */, + A10003 /* ContentView.swift in Sources */, + A1CD02 /* CDTNet_Harmonization.mlpackage in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + A10019 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + A10020 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + A10021 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = CDTNetDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.cdtnetdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + A10022 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + 
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = CDTNetDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.cdtnetdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + A10018 /* Build configuration list for PBXProject */ = { + isa = XCConfigurationList; + buildConfigurations = ( + A10019, + A10020, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + A10014 /* Build configuration list for PBXNativeTarget */ = { + isa = XCConfigurationList; + buildConfigurations = ( + A10021, + A10022, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = A10017; +} diff --git a/creative_apps/CDTNetDemo/CDTNetDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/CDTNetDemo/CDTNetDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/CDTNetDemo/CDTNetDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/CDTNetDemo/CDTNetDemo/Assets.xcassets/Contents.json b/creative_apps/CDTNetDemo/CDTNetDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/creative_apps/CDTNetDemo/CDTNetDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/CDTNetDemo/CDTNetDemo/CDTNetDemoApp.swift b/creative_apps/CDTNetDemo/CDTNetDemo/CDTNetDemoApp.swift new file mode 100644 index 0000000..e14bf54 --- /dev/null +++ b/creative_apps/CDTNetDemo/CDTNetDemo/CDTNetDemoApp.swift @@ -0,0 +1,10 @@ +import SwiftUI + +@main +struct CDTNetDemoApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} diff --git a/creative_apps/CDTNetDemo/CDTNetDemo/ContentView.swift b/creative_apps/CDTNetDemo/CDTNetDemo/ContentView.swift new file mode 100644 index 0000000..5739bfc --- /dev/null +++ b/creative_apps/CDTNetDemo/CDTNetDemo/ContentView.swift @@ -0,0 +1,466 @@ +import SwiftUI +import UIKit +import CoreML +import Vision +import PhotosUI + +// MARK: - Image Harmonization using CDTNet +// CDTNet takes a composite image and a mask indicating the foreground region, +// then produces a harmonized image where the foreground blends naturally with the background. 
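+//
+// NOTE: the loader below looks up "CDTNet_Harmonization.mlmodelc" by name --
+// Xcode compiles a bundled .mlpackage into that .mlmodelc form at build time,
+// so no manual compilation step is needed once the package is in the target.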
+ +struct ContentView: View { + @StateObject private var viewModel = HarmonizationViewModel() + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 20) { + // Composite image picker + Section { + PhotosPicker(selection: $viewModel.selectedPhoto, + matching: .images) { + if let image = viewModel.compositeImage { + Image(uiImage: image) + .resizable() + .scaledToFit() + .frame(maxHeight: 250) + .cornerRadius(12) + } else { + placeholderView(title: "Select Composite Image", + systemImage: "photo.on.rectangle") + } + } + } header: { + sectionHeader("Composite Image") + } + + // Mask region selector + if viewModel.compositeImage != nil { + Section { + VStack(spacing: 10) { + Text("Drag to select foreground region (mask)") + .font(.caption) + .foregroundColor(.secondary) + + MaskSelectionView( + image: viewModel.compositeImage!, + maskRect: $viewModel.normalizedMaskRect + ) + .frame(height: 250) + .cornerRadius(12) + } + } header: { + sectionHeader("Mask Selection") + } + } + + // Harmonize button + if viewModel.compositeImage != nil { + Button(action: { viewModel.harmonize() }) { + HStack { + if viewModel.isProcessing { + ProgressView() + .tint(.white) + } else { + Image(systemName: "wand.and.stars") + } + Text("Harmonize") + } + .frame(maxWidth: .infinity) + .padding() + .background(viewModel.isProcessing ? Color.gray : Color.accentColor) + .foregroundColor(.white) + .cornerRadius(12) + } + .disabled(viewModel.isProcessing) + } + + // Error display + if let error = viewModel.errorMessage { + Text(error) + .foregroundColor(.red) + .font(.caption) + .padding() + .background(Color.red.opacity(0.1)) + .cornerRadius(8) + } + + // Before / After comparison + if viewModel.harmonizedImage != nil { + Section { + BeforeAfterView( + before: viewModel.compositeImage!, + after: viewModel.harmonizedImage! + ) + .frame(height: 300) + .cornerRadius(12) + } header: { + sectionHeader("Result: Before / After") + } + } + } + .padding() + } + .navigationTitle("CDTNet Harmonization") + } + } + + private func sectionHeader(_ title: String) -> some View { + HStack { + Text(title) + .font(.headline) + Spacer() + } + } + + private func placeholderView(title: String, systemImage: String) -> some View { + VStack(spacing: 12) { + Image(systemName: systemImage) + .font(.system(size: 40)) + .foregroundColor(.secondary) + Text(title) + .foregroundColor(.secondary) + } + .frame(maxWidth: .infinity) + .frame(height: 180) + .background(Color(.systemGray6)) + .cornerRadius(12) + } +} + +// MARK: - ViewModel + +class HarmonizationViewModel: ObservableObject { + @Published var selectedPhoto: PhotosPickerItem? { + didSet { loadImage() } + } + @Published var compositeImage: UIImage? + @Published var harmonizedImage: UIImage? + @Published var normalizedMaskRect: CGRect = CGRect(x: 0.25, y: 0.25, width: 0.5, height: 0.5) + @Published var isProcessing = false + @Published var errorMessage: String? + + private func loadImage() { + guard let item = selectedPhoto else { return } + Task { + if let data = try? 
await item.loadTransferable(type: Data.self),
+               let image = UIImage(data: data) {
+                await MainActor.run {
+                    self.compositeImage = image
+                    self.harmonizedImage = nil
+                    self.errorMessage = nil
+                }
+            }
+        }
+    }
+
+    func harmonize() {
+        guard let inputImage = compositeImage else { return }
+        isProcessing = true
+        errorMessage = nil
+
+        Task {
+            do {
+                let result = try await performHarmonization(image: inputImage, maskRect: normalizedMaskRect)
+                await MainActor.run {
+                    self.harmonizedImage = result
+                    self.isProcessing = false
+                }
+            } catch {
+                await MainActor.run {
+                    self.errorMessage = error.localizedDescription
+                    self.isProcessing = false
+                }
+            }
+        }
+    }
+
+    // Perform harmonization using CDTNet CoreML model
+    // Input: composite_image (1,3,256,256) + mask (1,1,256,256) -> harmonized (1,3,256,256)
+    private func performHarmonization(image: UIImage, maskRect: CGRect) async throws -> UIImage {
+        // Load the CoreML model
+        guard let modelURL = Bundle.main.url(forResource: "CDTNet_Harmonization", withExtension: "mlmodelc") else {
+            throw HarmonizationError.modelNotFound(
+                "CDTNet_Harmonization.mlmodelc not found in bundle. " +
+                "Please compile and add the CDTNet_Harmonization.mlpackage to the project."
+            )
+        }
+
+        let config = MLModelConfiguration()
+        config.computeUnits = .cpuAndNeuralEngine
+        let model = try MLModel(contentsOf: modelURL, configuration: config)
+
+        // Prepare composite image input (1, 3, 256, 256)
+        let targetSize = CGSize(width: 256, height: 256)
+        guard let resizedCG = image.resized(to: targetSize)?.cgImage else {
+            throw HarmonizationError.imageProcessingFailed("Failed to resize composite image")
+        }
+
+        let compositeArray = try MLMultiArray(shape: [1, 3, 256, 256], dataType: .float32)
+        fillMultiArrayFromImage(resizedCG, into: compositeArray)
+
+        // Prepare mask input (1, 1, 256, 256) from the rectangular selection
+        let maskArray = try MLMultiArray(shape: [1, 1, 256, 256], dataType: .float32)
+        fillMaskArray(maskArray, rect: maskRect)
+
+        // Run inference
+        let inputFeatures = try MLDictionaryFeatureProvider(dictionary: [
+            "composite_image": MLFeatureValue(multiArray: compositeArray),
+            "mask": MLFeatureValue(multiArray: maskArray)
+        ])
+        let prediction = try model.prediction(from: inputFeatures)
+
+        // Extract harmonized output (1, 3, 256, 256)
+        guard let outputArray = prediction.featureValue(for: "harmonized_image")?.multiArrayValue else {
+            throw HarmonizationError.imageProcessingFailed("Failed to extract harmonized output")
+        }
+
+        let resultImage = imageFromMultiArray(outputArray, width: 256, height: 256)
+        guard let finalImage = resultImage else {
+            throw HarmonizationError.imageProcessingFailed("Failed to convert output to UIImage")
+        }
+        return finalImage
+    }
+
+    // Fill MLMultiArray with pixel data from CGImage (RGB, normalized 0-1)
+    private func fillMultiArrayFromImage(_ cgImage: CGImage, into array: MLMultiArray) {
+        let width = 256
+        let height = 256
+        let bytesPerPixel = 4
+        let bytesPerRow = bytesPerPixel * width
+        var pixelData = [UInt8](repeating: 0, count: width * height * bytesPerPixel)
+
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        guard let context = CGContext(
+            data: &pixelData, width: width, height: height,
+            bitsPerComponent: 8, bytesPerRow: bytesPerRow,
+            space: colorSpace,
+            bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
+        ) else { return }
+
+        context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height))
+
+        for y in 0..<height {
+            for x in 0..<width {
+                let offset = (y * width + x) * bytesPerPixel
+                array[[0, 0, y, x] as [NSNumber]] = NSNumber(value: Float(pixelData[offset]) / 255.0)
+                array[[0, 1, y, x] as [NSNumber]] = NSNumber(value: Float(pixelData[offset + 1]) / 255.0)
+                array[[0, 2, y, x] as [NSNumber]] = NSNumber(value: Float(pixelData[offset + 2]) / 255.0)
+            }
+        }
+    }
+
+    // Fill the mask MLMultiArray: 1.0 inside the selected rect, 0.0 elsewhere
+    private func fillMaskArray(_ array: MLMultiArray, rect: CGRect) {
+        let x0 = Int(rect.minX * 256)
+        let x1 = Int(rect.maxX * 256)
+        let y0 = Int(rect.minY * 256)
+        let y1 = Int(rect.maxY * 256)
+        for y in 0..<256 {
+            for x in 0..<256 {
+                let value: Float = (x >= x0 && x < x1 && y >= y0 && y < y1) ? 1.0 : 0.0
+                array[[0, 0, y, x] as [NSNumber]] = NSNumber(value: value)
+            }
+        }
+    }
+
+    // Convert (1, 3, 256, 256) MLMultiArray back to UIImage
+    private func imageFromMultiArray(_ array: MLMultiArray, width: Int, height: Int) -> UIImage? {
+        var pixelData = [UInt8](repeating: 255, count: width * height * 4)
+
+        for y in 0..<height {
+            for x in 0..<width {
+                let r = array[[0, 0, y, x] as [NSNumber]].floatValue
+                let g = array[[0, 1, y, x] as [NSNumber]].floatValue
+                let b = array[[0, 2, y, x] as [NSNumber]].floatValue
+                let offset = (y * width + x) * 4
+                pixelData[offset] = UInt8(max(0, min(255, r * 255)))
+                pixelData[offset + 1] = UInt8(max(0, min(255, g * 255)))
+                pixelData[offset + 2] = UInt8(max(0, min(255, b * 255)))
+            }
+        }
+
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        guard let context = CGContext(
+            data: &pixelData, width: width, height: height,
+            bitsPerComponent: 8, bytesPerRow: width * 4,
+            space: colorSpace,
+            bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
+        ), let cgImage = context.makeImage() else { return nil }
+        return UIImage(cgImage: cgImage)
+    }
+}
+
+// MARK: - Errors
+
+enum HarmonizationError: LocalizedError {
+    case modelNotFound(String)
+    case imageProcessingFailed(String)
+
+    var errorDescription: String? {
+        switch self {
+        case .modelNotFound(let message),
+             .imageProcessingFailed(let message):
+            return message
+        }
+    }
+}
+
+// MARK: - Mask Selection View
+// Drag on the image to draw the rectangular foreground mask (normalized coordinates).
+
+struct MaskSelectionView: View {
+    let image: UIImage
+    @Binding var maskRect: CGRect
+
+    var body: some View {
+        GeometryReader { geo in
+            ZStack(alignment: .topLeading) {
+                Image(uiImage: image)
+                    .resizable()
+                    .scaledToFill()
+                    .frame(width: geo.size.width, height: geo.size.height)
+                    .clipped()
+                Rectangle()
+                    .strokeBorder(Color.accentColor, lineWidth: 2)
+                    .background(Color.accentColor.opacity(0.2))
+                    .frame(width: maskRect.width * geo.size.width,
+                           height: maskRect.height * geo.size.height)
+                    .offset(x: maskRect.minX * geo.size.width,
+                            y: maskRect.minY * geo.size.height)
+            }
+            .contentShape(Rectangle())
+            .gesture(
+                DragGesture(minimumDistance: 0)
+                    .onChanged { value in
+                        let x0 = max(0, min(value.startLocation.x, value.location.x) / geo.size.width)
+                        let y0 = max(0, min(value.startLocation.y, value.location.y) / geo.size.height)
+                        let w = min(abs(value.translation.width) / geo.size.width, 1 - x0)
+                        let h = min(abs(value.translation.height) / geo.size.height, 1 - y0)
+                        maskRect = CGRect(x: x0, y: y0, width: w, height: h)
+                    }
+            )
+        }
+    }
+}
+
+// MARK: - Before / After Comparison
+// Drag horizontally to sweep the harmonized result over the original composite.
+
+struct BeforeAfterView: View {
+    let before: UIImage
+    let after: UIImage
+    @State private var split: CGFloat = 0.5
+
+    var body: some View {
+        GeometryReader { geo in
+            ZStack(alignment: .leading) {
+                Image(uiImage: before)
+                    .resizable()
+                    .scaledToFill()
+                    .frame(width: geo.size.width, height: geo.size.height)
+                    .clipped()
+                Image(uiImage: after)
+                    .resizable()
+                    .scaledToFill()
+                    .frame(width: geo.size.width, height: geo.size.height)
+                    .clipped()
+                    .mask(alignment: .leading) {
+                        Rectangle().frame(width: geo.size.width * split)
+                    }
+                Rectangle()
+                    .fill(Color.white)
+                    .frame(width: 2)
+                    .offset(x: geo.size.width * split)
+            }
+            .gesture(
+                DragGesture(minimumDistance: 0)
+                    .onChanged { value in
+                        split = max(0, min(1, value.location.x / geo.size.width))
+                    }
+            )
+        }
+    }
+}
+
+// MARK: - UIImage Helpers
+
+extension UIImage {
+    func resized(to targetSize: CGSize) -> UIImage? {
+        let renderer = UIGraphicsImageRenderer(size: targetSize)
+        return renderer.image { _ in
+            self.draw(in: CGRect(origin: .zero, size: targetSize))
+        }
+    }
+}
+
+#Preview {
+    ContentView()
+}
diff --git a/creative_apps/CDTNetDemo/CDTNetDemo/Info.plist b/creative_apps/CDTNetDemo/CDTNetDemo/Info.plist
new file mode 100644
index 0000000..fcdae98
--- /dev/null
+++ b/creative_apps/CDTNetDemo/CDTNetDemo/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>This app needs access to your photo library to select composite images for harmonization.</string>
+</dict>
+</plist>
diff --git a/creative_apps/DemucsDemo/DemucsDemo.xcodeproj/project.pbxproj b/creative_apps/DemucsDemo/DemucsDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..3761e41
--- /dev/null
+++ b/creative_apps/DemucsDemo/DemucsDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,274 @@
+// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + B10001 /* DemucsDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = B10002; }; + B10003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B10004; }; + B10005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = B10006; }; + B1DM02 /* HTDemucs_SourceSeparation.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = B1DM01; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + B10007 /* DemucsDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DemucsDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; + B10002 /* DemucsDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DemucsDemoApp.swift; sourceTree = "<group>"; }; + B10004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; + B10006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; + B10008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; + B1DM01 /* HTDemucs_SourceSeparation.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = HTDemucs_SourceSeparation.mlpackage; sourceTree = "<group>"; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + B10009 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + B10010 = { + isa = PBXGroup; + children = ( + B10011 /* DemucsDemo */, + B10012 /* Products */, + ); + sourceTree = "<group>"; + }; + B10011 /* DemucsDemo */ = { + isa = PBXGroup; + children = ( + B10002 /* DemucsDemoApp.swift */, + B10004 /* ContentView.swift */, + B10006 /* Assets.xcassets */, + B10008 /* Info.plist */, + B1DM01 /* HTDemucs_SourceSeparation.mlpackage */, + ); + path = DemucsDemo; + sourceTree = "<group>"; + }; + B10012 /* Products */ = { + isa = PBXGroup; + children = ( + B10007 /* DemucsDemo.app */, + ); + name = Products; + sourceTree = "<group>";
}; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + B10013 /* DemucsDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = B10014; + buildPhases = ( + B10015 /* Sources */, + B10009 /* Frameworks */, + B10016 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = DemucsDemo; + productName = DemucsDemo; + productReference = B10007; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + B10017 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + B10013 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = B10018; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = B10010; + productRefGroup = B10012; + projectDirPath = ""; + projectRoot = ""; + targets = ( + B10013, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + B10016 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + B10005 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + B10015 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + B10001 /* DemucsDemoApp.swift in Sources */, + B10003 /* ContentView.swift in Sources */, + B1DM02 /* HTDemucs_SourceSeparation.mlpackage in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + B10019 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + B10020 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + B10021 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + 
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = DemucsDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.demucsdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + B10022 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = DemucsDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.demucsdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + B10018 /* Build configuration list for PBXProject */ = { + isa = XCConfigurationList; + buildConfigurations = ( + B10019, + B10020, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + B10014 /* Build configuration list for PBXNativeTarget */ = { + isa = XCConfigurationList; + buildConfigurations = ( + B10021, + B10022, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = B10017; +} diff --git a/creative_apps/DemucsDemo/DemucsDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/DemucsDemo/DemucsDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/DemucsDemo/DemucsDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/DemucsDemo/DemucsDemo/Assets.xcassets/Contents.json b/creative_apps/DemucsDemo/DemucsDemo/Assets.xcassets/Contents.json new file mode 100644 index 
0000000..73c0059 --- /dev/null +++ b/creative_apps/DemucsDemo/DemucsDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift b/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift new file mode 100644 index 0000000..7d11bf6 --- /dev/null +++ b/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift @@ -0,0 +1,466 @@ +import SwiftUI +import UIKit +import CoreML +import AVFoundation +import UniformTypeIdentifiers + +// MARK: - HTDemucs Audio Source Separation Demo +// +// HTDemucs separates audio into 4 stems: Vocals, Drums, Bass, Other. +// +// IMPORTANT: The model operates in the frequency domain. +// In a production app, you must perform STFT (Short-Time Fourier Transform) on the input +// audio to produce the freq_input (1,8,2049,336) tensor, and also provide the raw +// time_input (1,2,343980) waveform. After inference, the frequency and time domain +// outputs must be combined via iSTFT (Inverse STFT) to reconstruct each stem's waveform. +// +// This demo uses simplified/placeholder audio processing to demonstrate the UI flow. +// A full implementation would require an STFT library (e.g., Accelerate vDSP). + +enum Stem: String, CaseIterable, Identifiable { + case vocals = "Vocals" + case drums = "Drums" + case bass = "Bass" + case other = "Other" + + var id: String { rawValue } + + var icon: String { + switch self { + case .vocals: return "mic.fill" + case .drums: return "drum.fill" + case .bass: return "guitars.fill" + case .other: return "waveform" + } + } + + var color: Color { + switch self { + case .vocals: return .purple + case .drums: return .orange + case .bass: return .blue + case .other: return .green + } + } +} + +struct ContentView: View { + @StateObject private var viewModel = DemucsViewModel() + + var body: some View { + NavigationStack { + VStack(spacing: 0) { + // Audio import section + VStack(spacing: 16) { + if let fileName = viewModel.audioFileName { + HStack { + Image(systemName: "music.note") + .font(.title2) + .foregroundColor(.accentColor) + VStack(alignment: .leading) { + Text(fileName) + .font(.headline) + .lineLimit(1) + if let duration = viewModel.audioDuration { + Text(formatDuration(duration)) + .font(.caption) + .foregroundColor(.secondary) + } + } + Spacer() + Button("Change") { + viewModel.showFilePicker = true + } + .font(.caption) + } + .padding() + .background(Color(.systemGray6)) + .cornerRadius(12) + } else { + Button(action: { viewModel.showFilePicker = true }) { + VStack(spacing: 12) { + Image(systemName: "square.and.arrow.down") + .font(.system(size: 36)) + .foregroundColor(.secondary) + Text("Import Audio File") + .foregroundColor(.secondary) + Text("WAV, MP3, M4A, AAC") + .font(.caption2) + .foregroundColor(.secondary.opacity(0.7)) + } + .frame(maxWidth: .infinity) + .frame(height: 140) + .background(Color(.systemGray6)) + .cornerRadius(12) + } + } + } + .padding() + + // Separation button + if viewModel.audioURL != nil && !viewModel.isSeparated { + Button(action: { viewModel.separate() }) { + HStack { + if viewModel.isProcessing { + ProgressView() + .tint(.white) + } else { + Image(systemName: "scissors") + } + Text(viewModel.isProcessing ? "Separating..." : "Separate Stems") + } + .frame(maxWidth: .infinity) + .padding() + .background(viewModel.isProcessing ? 
Color.gray : Color.accentColor) + .foregroundColor(.white) + .cornerRadius(12) + } + .disabled(viewModel.isProcessing) + .padding(.horizontal) + } + + // Progress + if viewModel.isProcessing { + VStack(spacing: 8) { + ProgressView(value: viewModel.progress) + .progressViewStyle(.linear) + Text(viewModel.statusMessage) + .font(.caption) + .foregroundColor(.secondary) + } + .padding() + } + + // Error + if let error = viewModel.errorMessage { + Text(error) + .foregroundColor(.red) + .font(.caption) + .padding() + .frame(maxWidth: .infinity) + .background(Color.red.opacity(0.1)) + .cornerRadius(8) + .padding(.horizontal) + } + + // Stem controls + if viewModel.isSeparated { + VStack(spacing: 12) { + Text("Separated Stems") + .font(.headline) + .frame(maxWidth: .infinity, alignment: .leading) + + ForEach(Stem.allCases) { stem in + StemPlayerView( + stem: stem, + isPlaying: viewModel.playingStem == stem, + onPlay: { viewModel.playStem(stem) }, + onStop: { viewModel.stopPlayback() } + ) + } + } + .padding() + } + + Spacer() + + // Waveform visualization placeholder + if viewModel.isSeparated { + WaveformView(activeStem: viewModel.playingStem) + .frame(height: 80) + .padding() + } + } + .navigationTitle("Demucs Separator") + .sheet(isPresented: $viewModel.showFilePicker) { + AudioFilePickerView(audioURL: $viewModel.audioURL) + } + } + } + + private func formatDuration(_ duration: TimeInterval) -> String { + let minutes = Int(duration) / 60 + let seconds = Int(duration) % 60 + return String(format: "%d:%02d", minutes, seconds) + } +} + +// MARK: - Stem Player Row + +struct StemPlayerView: View { + let stem: Stem + let isPlaying: Bool + let onPlay: () -> Void + let onStop: () -> Void + + var body: some View { + HStack(spacing: 16) { + Image(systemName: stem.icon) + .font(.title3) + .foregroundColor(stem.color) + .frame(width: 30) + + Text(stem.rawValue) + .font(.body) + .fontWeight(.medium) + + Spacer() + + // Volume indicator + HStack(spacing: 2) { + ForEach(0..<5) { i in + RoundedRectangle(cornerRadius: 1) + .fill(isPlaying ? stem.color : Color(.systemGray4)) + .frame(width: 3, height: CGFloat(8 + i * 4)) + } + } + + Button(action: { + if isPlaying { + onStop() + } else { + onPlay() + } + }) { + Image(systemName: isPlaying ? "stop.circle.fill" : "play.circle.fill") + .font(.title) + .foregroundColor(isPlaying ? .red : stem.color) + } + } + .padding() + .background( + RoundedRectangle(cornerRadius: 12) + .fill(isPlaying ? stem.color.opacity(0.1) : Color(.systemGray6)) + ) + } +} + +// MARK: - Animated Waveform + +struct WaveformView: View { + let activeStem: Stem? + @State private var phase: CGFloat = 0 + + var body: some View { + TimelineView(.animation) { timeline in + Canvas { context, size in + let color = activeStem?.color ?? .gray + let midY = size.height / 2 + let amplitude = activeStem != nil ? size.height * 0.35 : size.height * 0.1 + let time = timeline.date.timeIntervalSinceReferenceDate + + var path = Path() + path.move(to: CGPoint(x: 0, y: midY)) + for x in stride(from: 0, through: size.width, by: 2) { + let normalizedX = x / size.width + let y = midY + sin(normalizedX * .pi * 6 + time * 3) * amplitude * + (0.5 + 0.5 * sin(normalizedX * .pi * 2 + time * 1.5)) + path.addLine(to: CGPoint(x: x, y: y)) + } + + context.stroke(path, with: .color(color.opacity(0.7)), lineWidth: 2) + } + } + } +} + +// MARK: - Audio File Picker + +struct AudioFilePickerView: UIViewControllerRepresentable { + @Binding var audioURL: URL? 
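+ // URLs returned by UIDocumentPickerViewController are security-scoped; callers + // must invoke startAccessingSecurityScopedResource() before reading them (DemucsViewModel does).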
+ @Environment(\.dismiss) private var dismiss + + func makeUIViewController(context: Context) -> UIDocumentPickerViewController { + let types: [UTType] = [.audio, .mp3, .wav, .aiff, UTType("public.mpeg-4-audio") ?? .audio] + let picker = UIDocumentPickerViewController(forOpeningContentTypes: types) + picker.delegate = context.coordinator + picker.allowsMultipleSelection = false + return picker + } + + func updateUIViewController(_ uiViewController: UIDocumentPickerViewController, context: Context) {} + + func makeCoordinator() -> Coordinator { + Coordinator(self) + } + + class Coordinator: NSObject, UIDocumentPickerDelegate { + let parent: AudioFilePickerView + + init(_ parent: AudioFilePickerView) { + self.parent = parent + } + + func documentPicker(_ controller: UIDocumentPickerViewController, didPickDocumentsAt urls: [URL]) { + parent.audioURL = urls.first + parent.dismiss() + } + + func documentPickerWasCancelled(_ controller: UIDocumentPickerViewController) { + parent.dismiss() + } + } +} + +// MARK: - ViewModel + +class DemucsViewModel: ObservableObject { + @Published var audioURL: URL? { + didSet { updateAudioInfo() } + } + @Published var audioFileName: String? + @Published var audioDuration: TimeInterval? + @Published var showFilePicker = false + @Published var isProcessing = false + @Published var isSeparated = false + @Published var progress: Double = 0 + @Published var statusMessage = "" + @Published var errorMessage: String? + @Published var playingStem: Stem? + + private var audioPlayer: AVAudioPlayer? + + private func updateAudioInfo() { + guard let url = audioURL else { + audioFileName = nil + audioDuration = nil + isSeparated = false + return + } + + _ = url.startAccessingSecurityScopedResource() + audioFileName = url.lastPathComponent + isSeparated = false + + let asset = AVURLAsset(url: url) + Task { + let duration = try? await asset.load(.duration) + await MainActor.run { + self.audioDuration = duration?.seconds + } + } + } + + func separate() { + guard audioURL != nil else { return } + isProcessing = true + errorMessage = nil + progress = 0 + + Task { + do { + try await performSeparation() + await MainActor.run { + self.isSeparated = true + self.isProcessing = false + } + } catch { + await MainActor.run { + self.errorMessage = error.localizedDescription + self.isProcessing = false + } + } + } + } + + // Perform source separation using HTDemucs CoreML model + // NOTE: Full implementation requires: + // 1. Load audio waveform (stereo, ~344k samples at 44.1kHz ~ 7.8s segment) + // 2. Compute STFT to get freq_input (1,8,2049,336) - 8 channels = real+imag for 4 encoder inputs + // 3. Provide time_input (1,2,343980) - raw stereo waveform + // 4. Run model inference + // 5. Apply iSTFT on frequency outputs + combine with time outputs for each of 4 stems + // 6. Overlap-add for segments longer than ~7.8s + private func performSeparation() async throws { + await updateStatus("Loading model...", progress: 0.1) + + // Check for model + guard let modelURL = Bundle.main.url(forResource: "HTDemucs_SourceSeparation", withExtension: "mlmodelc") else { + throw DemucsError.modelNotFound( + "HTDemucs_SourceSeparation.mlmodelc not found in bundle. " + + "Please compile and add the HTDemucs_SourceSeparation.mlpackage to the project." 
+ ) + } + + let config = MLModelConfiguration() + config.computeUnits = .cpuAndNeuralEngine + let model = try MLModel(contentsOf: modelURL, configuration: config) + + await updateStatus("Computing STFT...", progress: 0.3) + + // STFT placeholder: In production, use Accelerate's vDSP to compute + // the Short-Time Fourier Transform of the input audio. + // Window size = 4096, hop = 1024, producing 2049 frequency bins x 336 time frames. + // The 8 channels represent real and imaginary parts for the hybrid architecture. + + let freqInput = try MLMultiArray(shape: [1, 8, 2049, 336], dataType: .float32) + let timeInput = try MLMultiArray(shape: [1, 2, 343980], dataType: .float32) + + // Fill with placeholder data (in production: actual STFT values and waveform); + // MLMultiArray contents are zero-initialized by default. + + await updateStatus("Running inference...", progress: 0.5) + + let inputFeatures = try MLDictionaryFeatureProvider(dictionary: [ + "freq_input": MLFeatureValue(multiArray: freqInput), + "time_input": MLFeatureValue(multiArray: timeInput) + ]) + + _ = try model.prediction(from: inputFeatures) + + await updateStatus("Reconstructing stems (iSTFT)...", progress: 0.8) + + // iSTFT placeholder: In production, apply inverse STFT to each stem's + // frequency output and combine it with the time-domain output. + // Each stem produces separate freq and time outputs that are summed. + // Use overlap-add for audio longer than one segment (~7.8 s at 44.1 kHz). + + await updateStatus("Complete!", progress: 1.0) + } + + @MainActor + private func updateStatus(_ message: String, progress: Double) { + self.statusMessage = message + self.progress = progress + } + + func playStem(_ stem: Stem) { + // In production, play the separated stem's audio buffer. + // For this demo, we simply replay the original audio. + stopPlayback() + playingStem = stem + + guard let url = audioURL else { return } + _ = url.startAccessingSecurityScopedResource() + do { + try AVAudioSession.sharedInstance().setCategory(.playback) + try AVAudioSession.sharedInstance().setActive(true) + audioPlayer = try AVAudioPlayer(contentsOf: url) + audioPlayer?.play() + } catch { + errorMessage = "Playback error: \(error.localizedDescription)" + } + } + + func stopPlayback() { + audioPlayer?.stop() + audioPlayer = nil + playingStem = nil + } +} + +enum DemucsError: LocalizedError { + case modelNotFound(String) + case processingFailed(String) + + var errorDescription: String? { + switch self { + case .modelNotFound(let msg): return msg + case .processingFailed(let msg): return msg + } + } +} + +#Preview { + ContentView() +} diff --git a/creative_apps/DemucsDemo/DemucsDemo/DemucsDemoApp.swift b/creative_apps/DemucsDemo/DemucsDemo/DemucsDemoApp.swift new file mode 100644 index 0000000..a66c5e7 --- /dev/null +++ b/creative_apps/DemucsDemo/DemucsDemo/DemucsDemoApp.swift @@ -0,0 +1,10 @@ +import SwiftUI + +@main +struct DemucsDemoApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} diff --git a/creative_apps/DemucsDemo/DemucsDemo/Info.plist b/creative_apps/DemucsDemo/DemucsDemo/Info.plist new file mode 100644 index 0000000..cd0b810 --- /dev/null +++ b/creative_apps/DemucsDemo/DemucsDemo/Info.plist @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> +<plist version="1.0"> +<dict> + <key>NSMicrophoneUsageDescription</key> + <string>This app may use the microphone to record audio for source separation.</string> +</dict> +</plist>
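A note on the STFT placeholder in DemucsDemo/ContentView.swift: the comments there defer the real Short-Time Fourier Transform to Accelerate. The sketch below shows what one analysis frame could look like, assuming a 4096-point transform with a Hann window, which yields the 2049 bins the comments name; the function stftFrame and its layout are illustrative assumptions, not part of the DemucsDemo target.

import Accelerate

// Minimal sketch: one STFT frame -> 2049 complex bins (assumed helper, not in the app).
// A complex DFT with a zero imaginary input is used for clarity over speed.
func stftFrame(_ frame: [Float]) -> (real: [Float], imag: [Float])? {
    let n = 4096
    precondition(frame.count == n, "expects one 4096-sample window")
    guard let setup = vDSP_DFT_zop_CreateSetup(nil, vDSP_Length(n), .FORWARD) else { return nil }
    defer { vDSP_DFT_DestroySetup(setup) }

    // Hann window to reduce spectral leakage before the transform
    var window = [Float](repeating: 0, count: n)
    vDSP_hann_window(&window, vDSP_Length(n), Int32(vDSP_HANN_NORM))
    var inReal = [Float](repeating: 0, count: n)
    vDSP_vmul(frame, 1, window, 1, &inReal, 1, vDSP_Length(n))
    let inImag = [Float](repeating: 0, count: n)

    var outReal = [Float](repeating: 0, count: n)
    var outImag = [Float](repeating: 0, count: n)
    vDSP_DFT_Execute(setup, inReal, inImag, &outReal, &outImag)

    // For a real-valued input, bins above n/2 are conjugate-symmetric; keep 0...n/2.
    return (Array(outReal[0...(n / 2)]), Array(outImag[0...(n / 2)]))
}

Sliding this across the stereo waveform at hop 1024 would fill the (1,8,2049,336) freq_input plane by plane; the reverse path would inverse-transform each stem's frequency output and overlap-add it with the time-domain output, exactly as the performSeparation() comments describe.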
diff --git a/creative_apps/FOMMDemo/FOMMDemo.xcodeproj/project.pbxproj b/creative_apps/FOMMDemo/FOMMDemo.xcodeproj/project.pbxproj new file mode 100644 index 0000000..22b48d0 --- /dev/null +++ b/creative_apps/FOMMDemo/FOMMDemo.xcodeproj/project.pbxproj @@ -0,0 +1,278 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + D10001 /* FOMMDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = D10002; }; + D10003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = D10004; }; + D10005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = D10006; }; + D1FM02 /* FOMM_KPDetector.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = D1FM01; }; + D1FM04 /* FOMM_Generator.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = D1FM03; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + D10007 /* FOMMDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = FOMMDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; + D10002 /* FOMMDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FOMMDemoApp.swift; sourceTree = "<group>"; }; + D10004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; + D10006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; + D10008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; + D1FM01 /* FOMM_KPDetector.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = FOMM_KPDetector.mlpackage; sourceTree = "<group>"; }; + D1FM03 /* FOMM_Generator.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = FOMM_Generator.mlpackage; sourceTree = "<group>"; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + D10009 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + D10010 = { + isa = PBXGroup; + children = ( + D10011 /* FOMMDemo */, + D10012 /* Products */, + ); + sourceTree = "<group>"; + }; + D10011 /* FOMMDemo */ = { + isa = PBXGroup; + children = ( + D10002 /* FOMMDemoApp.swift */, + D10004 /* ContentView.swift */, + D10006 /* Assets.xcassets */, + D10008 /* Info.plist */, + D1FM01 /* FOMM_KPDetector.mlpackage */, + D1FM03 /* FOMM_Generator.mlpackage */, + ); + path = FOMMDemo; + sourceTree = "<group>"; + }; + D10012 /* Products */ = { + isa = PBXGroup; + children = ( + D10007 /* FOMMDemo.app */, + ); + name = Products; + sourceTree = "<group>"; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + D10013 /* FOMMDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = D10014; + buildPhases = ( + D10015 /* Sources */, + D10009 /* Frameworks */, + D10016 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = FOMMDemo; + productName = FOMMDemo; + productReference = D10007; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + D10017 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500;
+ TargetAttributes = { + D10013 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = D10018; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = D10010; + productRefGroup = D10012; + projectDirPath = ""; + projectRoot = ""; + targets = ( + D10013, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + D10016 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + D10005 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + D10015 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + D10001 /* FOMMDemoApp.swift in Sources */, + D10003 /* ContentView.swift in Sources */, + D1FM02 /* FOMM_KPDetector.mlpackage in Sources */, + D1FM04 /* FOMM_Generator.mlpackage in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + D10019 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + D10020 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + D10021 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = FOMMDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + 
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.fommdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + D10022 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = FOMMDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.fommdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + D10018 /* Build configuration list for PBXProject */ = { + isa = XCConfigurationList; + buildConfigurations = ( + D10019, + D10020, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + D10014 /* Build configuration list for PBXNativeTarget */ = { + isa = XCConfigurationList; + buildConfigurations = ( + D10021, + D10022, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = D10017; +} diff --git a/creative_apps/FOMMDemo/FOMMDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/FOMMDemo/FOMMDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/FOMMDemo/FOMMDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/FOMMDemo/FOMMDemo/Assets.xcassets/Contents.json b/creative_apps/FOMMDemo/FOMMDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/creative_apps/FOMMDemo/FOMMDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/FOMMDemo/FOMMDemo/ContentView.swift b/creative_apps/FOMMDemo/FOMMDemo/ContentView.swift new file mode 100644 index 0000000..dff42e6 --- /dev/null +++ b/creative_apps/FOMMDemo/FOMMDemo/ContentView.swift @@ -0,0 +1,621 @@ +import SwiftUI +import UIKit +import CoreML +import Vision +import PhotosUI + +// MARK: - FOMM (First Order Motion Model) Face Reenactment Demo +// +// Two-model pipeline: +// 1. 
FOMM_KPDetector: Detects 10 facial keypoints + 2x2 Jacobian matrices +// Input: image (1,3,256,256) +// Output: keypoints (1,10,2) + jacobians (1,10,2,2) +// +// 2. FOMM_Generator: Generates reenacted face from source + keypoint pairs +// Input: source_image (1,3,256,256) + source/driving keypoints & jacobians +// Output: prediction (1,3,256,256) + +struct ContentView: View { + @StateObject private var viewModel = FOMMViewModel() + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 20) { + // Source and Driving image pickers side by side + HStack(spacing: 12) { + // Source face + VStack(spacing: 8) { + sectionHeader("Source Face") + PhotosPicker(selection: $viewModel.selectedSourcePhoto, + matching: .images) { + if let image = viewModel.sourceImage { + ZStack { + Image(uiImage: image) + .resizable() + .scaledToFill() + .frame(width: 150, height: 150) + .clipped() + .cornerRadius(12) + + // Keypoint overlay on source + if !viewModel.sourceKeypoints.isEmpty { + KeypointOverlay( + keypoints: viewModel.sourceKeypoints, + color: .green + ) + .frame(width: 150, height: 150) + } + } + } else { + placeholderView( + systemImage: "person.crop.square", + size: 150 + ) + } + } + } + + // Driving face + VStack(spacing: 8) { + sectionHeader("Driving Face") + PhotosPicker(selection: $viewModel.selectedDrivingPhoto, + matching: .images) { + if let image = viewModel.drivingImage { + ZStack { + Image(uiImage: image) + .resizable() + .scaledToFill() + .frame(width: 150, height: 150) + .clipped() + .cornerRadius(12) + + // Keypoint overlay on driving + if !viewModel.drivingKeypoints.isEmpty { + KeypointOverlay( + keypoints: viewModel.drivingKeypoints, + color: .orange + ) + .frame(width: 150, height: 150) + } + } + } else { + placeholderView( + systemImage: "person.crop.square.filled.and.at.rectangle", + size: 150 + ) + } + } + } + } + + // Detect keypoints button + if viewModel.sourceImage != nil && viewModel.drivingImage != nil { + Button(action: { viewModel.detectKeypoints() }) { + HStack { + if viewModel.isDetectingKeypoints { + ProgressView() + .tint(.white) + } else { + Image(systemName: "dot.radiowaves.left.and.right") + } + Text("Detect Keypoints") + } + .frame(maxWidth: .infinity) + .padding() + .background(viewModel.isDetectingKeypoints ? 
Color.gray : Color.blue) + .foregroundColor(.white) + .cornerRadius(12) + } + .disabled(viewModel.isDetectingKeypoints) + } + + // Keypoint info + if !viewModel.sourceKeypoints.isEmpty { + VStack(alignment: .leading, spacing: 6) { + Text("Detected Keypoints") + .font(.headline) + + HStack(spacing: 20) { + VStack(alignment: .leading) { + Text("Source: \(viewModel.sourceKeypoints.count) points") + .foregroundColor(.green) + Text("+ \(viewModel.sourceKeypoints.count) Jacobians (2x2)") + .font(.caption) + .foregroundColor(.secondary) + } + Spacer() + VStack(alignment: .leading) { + Text("Driving: \(viewModel.drivingKeypoints.count) points") + .foregroundColor(.orange) + Text("+ \(viewModel.drivingKeypoints.count) Jacobians (2x2)") + .font(.caption) + .foregroundColor(.secondary) + } + } + } + .padding() + .background(Color(.systemGray6)) + .cornerRadius(12) + } + + // Generate button + if !viewModel.sourceKeypoints.isEmpty && !viewModel.drivingKeypoints.isEmpty { + Button(action: { viewModel.generateReenactment() }) { + HStack { + if viewModel.isGenerating { + ProgressView() + .tint(.white) + } else { + Image(systemName: "face.smiling") + } + Text("Generate Reenactment") + } + .frame(maxWidth: .infinity) + .padding() + .background(viewModel.isGenerating ? Color.gray : Color.accentColor) + .foregroundColor(.white) + .cornerRadius(12) + } + .disabled(viewModel.isGenerating) + } + + // Error display + if let error = viewModel.errorMessage { + Text(error) + .foregroundColor(.red) + .font(.caption) + .padding() + .frame(maxWidth: .infinity) + .background(Color.red.opacity(0.1)) + .cornerRadius(8) + } + + // Result + if let result = viewModel.resultImage { + Section { + VStack(spacing: 12) { + Text("Reenacted Face") + .font(.headline) + .frame(maxWidth: .infinity, alignment: .leading) + + Image(uiImage: result) + .resizable() + .scaledToFit() + .frame(maxHeight: 300) + .cornerRadius(12) + + // Comparison row + HStack(spacing: 8) { + if let src = viewModel.sourceImage { + VStack { + Image(uiImage: src) + .resizable() + .scaledToFill() + .frame(width: 80, height: 80) + .clipped() + .cornerRadius(8) + Text("Source") + .font(.caption2) + } + } + Image(systemName: "plus") + .foregroundColor(.secondary) + if let drv = viewModel.drivingImage { + VStack { + Image(uiImage: drv) + .resizable() + .scaledToFill() + .frame(width: 80, height: 80) + .clipped() + .cornerRadius(8) + Text("Driving") + .font(.caption2) + } + } + Image(systemName: "arrow.right") + .foregroundColor(.secondary) + VStack { + Image(uiImage: result) + .resizable() + .scaledToFill() + .frame(width: 80, height: 80) + .clipped() + .cornerRadius(8) + Text("Result") + .font(.caption2) + } + } + } + } + } + } + .padding() + } + .navigationTitle("FOMM Reenactment") + } + } + + private func sectionHeader(_ title: String) -> some View { + Text(title) + .font(.caption) + .fontWeight(.semibold) + .foregroundColor(.secondary) + } + + private func placeholderView(systemImage: String, size: CGFloat) -> some View { + VStack(spacing: 8) { + Image(systemName: systemImage) + .font(.system(size: 30)) + .foregroundColor(.secondary) + Text("Select") + .font(.caption) + .foregroundColor(.secondary) + } + .frame(width: size, height: size) + .background(Color(.systemGray6)) + .cornerRadius(12) + } +} + +// MARK: - Keypoint Overlay + +struct KeypointOverlay: View { + let keypoints: [CGPoint] + let color: Color + + var body: some View { + GeometryReader { geo in + ForEach(0..<keypoints.count, id: \.self) { i in + Circle() + .fill(color) + .frame(width: 6, height: 6) + .position(x: keypoints[i].x * geo.size.width, y: keypoints[i].y * geo.size.height) + } + // Faint lines connecting consecutive keypoints + Canvas { context, size in + guard keypoints.count >= 2 else { return } + for i in 0..<keypoints.count - 1 { + var line = Path() + line.move(to: CGPoint(x: keypoints[i].x * size.width, y: keypoints[i].y * size.height)) + line.addLine(to: CGPoint(x: keypoints[i + 1].x * size.width, y: keypoints[i + 1].y * size.height)) + context.stroke(line, with: .color(color.opacity(0.4)), lineWidth: 1) + } + } + } + } +} + +// MARK: - ViewModel + +class FOMMViewModel: ObservableObject { + @Published var selectedSourcePhoto: PhotosPickerItem? { + didSet { loadPhoto(selectedSourcePhoto, isSource: true) } + } + @Published var selectedDrivingPhoto: PhotosPickerItem? { + didSet { loadPhoto(selectedDrivingPhoto, isSource: false) } + } + @Published var sourceImage: UIImage? + @Published var drivingImage: UIImage? + @Published var sourceKeypoints: [CGPoint] = [] + @Published var drivingKeypoints: [CGPoint] = [] + @Published var isDetectingKeypoints = false + @Published var isGenerating = false + @Published var errorMessage: String? + @Published var resultImage: UIImage? + + private func loadPhoto(_ item: PhotosPickerItem?, isSource: Bool) { + guard let item = item else { return } + Task { + let data = try? await item.loadTransferable(type: Data.self) + let image = data.flatMap { UIImage(data: $0) } + await MainActor.run { + if isSource { + self.sourceImage = image + self.sourceKeypoints = [] + } else { + self.drivingImage = image + self.drivingKeypoints = [] + } + self.resultImage = nil + } + } + } + + func detectKeypoints() { + guard let source = sourceImage, let driving = drivingImage else { return } + isDetectingKeypoints = true + errorMessage = nil + + Task { + do { + let (src, drv) = try await runKeypointDetection(source: source, driving: driving) + await MainActor.run { + self.sourceKeypoints = src + self.drivingKeypoints = drv + self.isDetectingKeypoints = false + } + } catch { + await MainActor.run { + self.errorMessage = error.localizedDescription + self.isDetectingKeypoints = false + } + } + } + } + + // Run FOMM_KPDetector on both images + // Input: image (1,3,256,256), Output: keypoints (1,10,2) + private func runKeypointDetection(source: UIImage, driving: UIImage) async throws ->
([CGPoint], [CGPoint]) { + guard let modelURL = Bundle.main.url(forResource: "FOMM_KPDetector", withExtension: "mlmodelc") else { + throw FOMMError.modelNotFound( + "FOMM_KPDetector.mlmodelc not found in bundle. " + + "Please compile and add the FOMM_KPDetector.mlpackage to the project." + ) + } + + let config = MLModelConfiguration() + config.computeUnits = .cpuAndNeuralEngine + let model = try MLModel(contentsOf: modelURL, configuration: config) + + // Detect source keypoints + let sourceArray = try imageToMultiArray(source) + let sourceInput = try MLDictionaryFeatureProvider(dictionary: [ + "image": MLFeatureValue(multiArray: sourceArray) + ]) + let sourceOutput = try model.prediction(from: sourceInput) + + guard let sourceKPArray = sourceOutput.featureValue(for: "keypoints")?.multiArrayValue else { + throw FOMMError.processingFailed("Failed to extract source keypoints") + } + let sourceKP = extractKeypoints(from: sourceKPArray) + + // Detect driving keypoints + let drivingArray = try imageToMultiArray(driving) + let drivingInput = try MLDictionaryFeatureProvider(dictionary: [ + "image": MLFeatureValue(multiArray: drivingArray) + ]) + let drivingOutput = try model.prediction(from: drivingInput) + + guard let drivingKPArray = drivingOutput.featureValue(for: "keypoints")?.multiArrayValue else { + throw FOMMError.processingFailed("Failed to extract driving keypoints") + } + let drivingKP = extractKeypoints(from: drivingKPArray) + + return (sourceKP, drivingKP) + } + + // Extract 10 keypoints from (1,10,2) MLMultiArray + private func extractKeypoints(from array: MLMultiArray) -> [CGPoint] { + var points: [CGPoint] = [] + for i in 0..<10 { + let x = CGFloat(array[[0, i, 0] as [NSNumber]].floatValue) + let y = CGFloat(array[[0, i, 1] as [NSNumber]].floatValue) + // Normalize from [-1, 1] to [0, 1] + let normX = (x + 1.0) / 2.0 + let normY = (y + 1.0) / 2.0 + points.append(CGPoint(x: normX, y: normY)) + } + return points + } + + func generateReenactment() { + guard sourceImage != nil else { return } + isGenerating = true + errorMessage = nil + + Task { + do { + let result = try await runGeneration() + await MainActor.run { + self.resultImage = result + self.isGenerating = false + } + } catch { + await MainActor.run { + self.errorMessage = error.localizedDescription + self.isGenerating = false + } + } + } + } + + // Generate reenacted face using FOMM_Generator + // Input: source_image (1,3,256,256) + keypoint data + // Output: prediction (1,3,256,256) + private func runGeneration() async throws -> UIImage { + guard let modelURL = Bundle.main.url(forResource: "FOMM_Generator", withExtension: "mlmodelc") else { + throw FOMMError.modelNotFound( + "FOMM_Generator.mlmodelc not found in bundle. " + + "Please compile and add the FOMM_Generator.mlpackage to the project." 
+ ) + } + + let config = MLModelConfiguration() + config.computeUnits = .cpuAndNeuralEngine + let model = try MLModel(contentsOf: modelURL, configuration: config) + + guard let source = sourceImage else { + throw FOMMError.processingFailed("Source image not available") + } + + let sourceArray = try imageToMultiArray(source) + + // Prepare keypoint arrays + let srcKPArray = try keypointsToMultiArray(sourceKeypoints) + let drvKPArray = try keypointsToMultiArray(drivingKeypoints) + + // Prepare Jacobian arrays (1,10,2,2) + let srcJacobians = try MLMultiArray(shape: [1, 10, 2, 2], dataType: .float32) + let drvJacobians = try MLMultiArray(shape: [1, 10, 2, 2], dataType: .float32) + // Initialize Jacobians as identity matrices + for i in 0..<10 { + srcJacobians[[0, i, 0, 0] as [NSNumber]] = 1.0 + srcJacobians[[0, i, 1, 1] as [NSNumber]] = 1.0 + drvJacobians[[0, i, 0, 0] as [NSNumber]] = 1.0 + drvJacobians[[0, i, 1, 1] as [NSNumber]] = 1.0 + } + + let inputFeatures = try MLDictionaryFeatureProvider(dictionary: [ + "source_image": MLFeatureValue(multiArray: sourceArray), + "source_keypoints": MLFeatureValue(multiArray: srcKPArray), + "driving_keypoints": MLFeatureValue(multiArray: drvKPArray), + "source_jacobians": MLFeatureValue(multiArray: srcJacobians), + "driving_jacobians": MLFeatureValue(multiArray: drvJacobians) + ]) + + let output = try model.prediction(from: inputFeatures) + + guard let predictionArray = output.featureValue(for: "prediction")?.multiArrayValue else { + throw FOMMError.processingFailed("Failed to extract prediction output") + } + + guard let resultImage = imageFromMultiArray(predictionArray, width: 256, height: 256) else { + throw FOMMError.processingFailed("Failed to convert prediction to image") + } + + return resultImage + } + + // Convert UIImage to (1,3,256,256) MLMultiArray + private func imageToMultiArray(_ image: UIImage) throws -> MLMultiArray { + let width = 256 + let height = 256 + guard let resized = image.resized(to: CGSize(width: width, height: height)), + let cgImage = resized.cgImage else { + throw FOMMError.processingFailed("Failed to resize image") + } + + let array = try MLMultiArray(shape: [1, 3, 256, 256], dataType: .float32) + let bytesPerPixel = 4 + let bytesPerRow = bytesPerPixel * width + var pixelData = [UInt8](repeating: 0, count: width * height * bytesPerPixel) + + let colorSpace = CGColorSpaceCreateDeviceRGB() + guard let context = CGContext( + data: &pixelData, width: width, height: height, + bitsPerComponent: 8, bytesPerRow: bytesPerRow, + space: colorSpace, + bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue + ) else { + throw FOMMError.processingFailed("Failed to create CGContext") + } + + context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height)) + + for y in 0..<height { + for x in 0..<width { + let offset = (y * width + x) * 4 + // Interleaved RGBA bytes to channel-planar floats in [0,1] + array[[0, 0, y, x] as [NSNumber]] = NSNumber(value: Float(pixelData[offset]) / 255.0) + array[[0, 1, y, x] as [NSNumber]] = NSNumber(value: Float(pixelData[offset + 1]) / 255.0) + array[[0, 2, y, x] as [NSNumber]] = NSNumber(value: Float(pixelData[offset + 2]) / 255.0) + } + } + return array + } + + // Convert normalized [0,1] keypoints to a (1,10,2) MLMultiArray in [-1,1] + private func keypointsToMultiArray(_ points: [CGPoint]) throws -> MLMultiArray { + let array = try MLMultiArray(shape: [1, 10, 2], dataType: .float32) + for i in 0..<min(10, points.count) { + array[[0, i, 0] as [NSNumber]] = NSNumber(value: Float(points[i].x) * 2.0 - 1.0) + array[[0, i, 1] as [NSNumber]] = NSNumber(value: Float(points[i].y) * 2.0 - 1.0) + } + return array + } + + // Convert (1,3,256,256) MLMultiArray back to UIImage + private func imageFromMultiArray(_ array: MLMultiArray, width: Int, height: Int) -> UIImage? { + var pixelData = [UInt8](repeating: 255, count: width * height * 4) + + for y in 0..<height { + for x in 0..<width { + let offset = (y * width + x) * 4 + pixelData[offset] = UInt8(max(0, min(255, array[[0, 0, y, x] as [NSNumber]].floatValue * 255))) + pixelData[offset + 1] = UInt8(max(0, min(255, array[[0, 1, y, x] as [NSNumber]].floatValue * 255))) + pixelData[offset + 2] = UInt8(max(0, min(255, array[[0, 2, y, x] as [NSNumber]].floatValue * 255))) + } + } + + let colorSpace = CGColorSpaceCreateDeviceRGB() + guard let context = CGContext( + data: &pixelData, width: width, height: height, + bitsPerComponent: 8, bytesPerRow: width * 4, + space: colorSpace, + bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue + ), let cgImage = context.makeImage() else { return nil } + return UIImage(cgImage: cgImage) + } +} + +enum FOMMError: LocalizedError { + case modelNotFound(String) + case processingFailed(String) + + var errorDescription: String? { + switch self { + case .modelNotFound(let msg): return msg + case .processingFailed(let msg): return msg + } + } +} + +extension UIImage { + func resized(to targetSize: CGSize) -> UIImage? { + let renderer = UIGraphicsImageRenderer(size: targetSize) + return renderer.image { _ in + self.draw(in: CGRect(origin: .zero, size: targetSize)) + } + } +} + +#Preview { + ContentView() +} diff --git a/creative_apps/FOMMDemo/FOMMDemo/FOMMDemoApp.swift b/creative_apps/FOMMDemo/FOMMDemo/FOMMDemoApp.swift new file mode 100644 index 0000000..504dadf --- /dev/null +++ b/creative_apps/FOMMDemo/FOMMDemo/FOMMDemoApp.swift @@ -0,0 +1,10 @@ +import SwiftUI + +@main +struct FOMMDemoApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} diff --git a/creative_apps/FOMMDemo/FOMMDemo/Info.plist b/creative_apps/FOMMDemo/FOMMDemo/Info.plist new file mode 100644 index 0000000..f4bfe28 --- /dev/null +++ b/creative_apps/FOMMDemo/FOMMDemo/Info.plist @@ -0,0 +1,10 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> +<plist version="1.0"> +<dict> + <key>NSPhotoLibraryUsageDescription</key> + <string>This app needs access to your photo library to select source and driving face images.</string> + <key>NSCameraUsageDescription</key> + <string>This app may use the camera to capture driving expressions for face reenactment.</string> +</dict> +</plist> diff --git a/creative_apps/Face3DDemo/Face3DDemo.xcodeproj/project.pbxproj b/creative_apps/Face3DDemo/Face3DDemo.xcodeproj/project.pbxproj new file mode 100644 index 0000000..2b51ec3 --- /dev/null +++ b/creative_apps/Face3DDemo/Face3DDemo.xcodeproj/project.pbxproj @@ -0,0 +1,272 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + D40000010000000000000001 /* Face3DDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = D40000020000000000000001; }; + D40000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = D40000020000000000000002; }; + D40000010000000000000003 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = D40000020000000000000003; }; + D4000001000000000000D001 /* 3DDFA_V2.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = D4000002000000000000D001; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + D40000020000000000000000 /* Face3DDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Face3DDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; + D40000020000000000000001 /* Face3DDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Face3DDemoApp.swift; sourceTree = "<group>"; }; + D40000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; + D40000020000000000000003 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; + D40000020000000000000004 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; + D4000002000000000000D001 /* 3DDFA_V2.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = 3DDFA_V2.mlpackage; sourceTree = "<group>"; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + D40000030000000000000001 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + D40000040000000000000000 = { + isa = PBXGroup; + children = ( + D40000040000000000000001 /* Face3DDemo */, + D40000040000000000000002 /* Products */, + ); + sourceTree = "<group>"; + }; + D40000040000000000000001 /*
Face3DDemo */ = { + isa = PBXGroup; + children = ( + D40000020000000000000001 /* Face3DDemoApp.swift */, + D40000020000000000000002 /* ContentView.swift */, + D40000020000000000000003 /* Assets.xcassets */, + D40000020000000000000004 /* Info.plist */, + D4000002000000000000D001 /* 3DDFA_V2.mlpackage */, + ); + path = Face3DDemo; + sourceTree = "<group>"; + }; + D40000040000000000000002 /* Products */ = { + isa = PBXGroup; + children = ( + D40000020000000000000000 /* Face3DDemo.app */, + ); + name = Products; + sourceTree = "<group>"; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + D40000050000000000000001 /* Face3DDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = D40000070000000000000001; + buildPhases = ( + D40000060000000000000001 /* Sources */, + D40000030000000000000001 /* Frameworks */, + D40000060000000000000002 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = Face3DDemo; + productName = Face3DDemo; + productReference = D40000020000000000000000; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + D40000080000000000000001 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + D40000050000000000000001 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = D40000070000000000000002; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = D40000040000000000000000; + productRefGroup = D40000040000000000000002; + projectDirPath = ""; + projectRoot = ""; + targets = ( + D40000050000000000000001 /* Face3DDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + D40000060000000000000002 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + D40000010000000000000003 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + D40000060000000000000001 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + D40000010000000000000001 /* Face3DDemoApp.swift in Sources */, + D40000010000000000000002 /* ContentView.swift in Sources */, + D4000001000000000000D001 /* 3DDFA_V2.mlpackage in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + D40000090000000000000001 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + D40000090000000000000002 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + D40000090000000000000003 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = Face3DDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.face3ddemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + D40000090000000000000004 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = Face3DDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.face3ddemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + D40000070000000000000001 /* Build configuration list for PBXNativeTarget "Face3DDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + D40000090000000000000003 /* Debug */, + D40000090000000000000004 /* Release */, + ); + defaultConfigurationIsVisible 
= 0; + defaultConfigurationName = Release; + }; + D40000070000000000000002 /* Build configuration list for PBXProject "Face3DDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + D40000090000000000000001 /* Debug */, + D40000090000000000000002 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = D40000080000000000000001 /* Project object */; +} diff --git a/creative_apps/Face3DDemo/Face3DDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/Face3DDemo/Face3DDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/Face3DDemo/Face3DDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/Face3DDemo/Face3DDemo/Assets.xcassets/Contents.json b/creative_apps/Face3DDemo/Face3DDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/creative_apps/Face3DDemo/Face3DDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/Face3DDemo/Face3DDemo/ContentView.swift b/creative_apps/Face3DDemo/Face3DDemo/ContentView.swift new file mode 100644 index 0000000..3b66e5c --- /dev/null +++ b/creative_apps/Face3DDemo/Face3DDemo/ContentView.swift @@ -0,0 +1,643 @@ +import SwiftUI +import UIKit +import CoreML +import Vision +import PhotosUI + +// MARK: - 3DMM Parameter Categories + +/// Decomposed 3D Morphable Model parameters from the model output +struct Face3DMMParams { + // 12 pose parameters (rotation, translation, scale) + var pose: [Float] = [] // indices 0-11 + // 40 shape parameters (identity basis coefficients) + var shape: [Float] = [] // indices 12-51 + // 10 expression parameters + var expression: [Float] = [] // indices 52-61 + + /// Euler angles extracted from the pose parameters (approximated) + var pitch: Float { pose.count >= 3 ? pose[0] * 180.0 / .pi : 0 } + var yaw: Float { pose.count >= 3 ? pose[1] * 180.0 / .pi : 0 } + var roll: Float { pose.count >= 3 ? pose[2] * 180.0 / .pi : 0 } + + /// Expression labels for display + static let expressionLabels = [ + "Mouth Open", "Smile", "Brow Raise", "Brow Furrow", + "Eye Close", "Lip Stretch", "Lip Press", "Jaw Drop", + "Cheek Puff", "Nose Wrinkle" + ] +} + +// MARK: - Face 3D Processor + +/// Processes face images through the 3DDFA_V2 CoreML model +class Face3DProcessor: ObservableObject { + @Published var inputImage: UIImage? + @Published var faceCrop: UIImage? + @Published var params: Face3DMMParams? + @Published var isProcessing = false + @Published var errorMessage: String? + @Published var faceLandmarks: [CGPoint] = [] + + private var model: MLModel? + private let inputSize = 120 + + init() { + loadModel() + } + + private func loadModel() { + do { + let config = MLModelConfiguration() + config.computeUnits = .all + + guard let modelURL = Bundle.main.url(forResource: "3DDFA_V2", withExtension: "mlmodelc") else { + errorMessage = "Model not found. Please add 3DDFA_V2.mlmodelc to the project bundle." 
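+ // Xcode compiles a bundled .mlpackage into a .mlmodelc with the same base name + // at build time, which is why this lookup targets "3DDFA_V2.mlmodelc" rather than the package itself.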
+ return + } + model = try MLModel(contentsOf: modelURL, configuration: config) + } catch { + errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + + /// Detect face using Vision and return bounding box + private func detectFace(in image: UIImage) async throws -> (CGRect, [CGPoint])? { + guard let cgImage = image.cgImage else { return nil } + + return try await withCheckedThrowingContinuation { continuation in + let request = VNDetectFaceLandmarksRequest { request, error in + if let error = error { + continuation.resume(throwing: error) + return + } + guard let face = (request.results as? [VNFaceObservation])?.first else { + continuation.resume(returning: nil) + return + } + + // Extract landmark points for overlay + var landmarks: [CGPoint] = [] + if let allPoints = face.landmarks?.allPoints { + let imageWidth = CGFloat(cgImage.width) + let imageHeight = CGFloat(cgImage.height) + for point in allPoints.normalizedPoints { + let x = face.boundingBox.origin.x * imageWidth + point.x * face.boundingBox.width * imageWidth + let y = (1.0 - face.boundingBox.origin.y - face.boundingBox.height) * imageHeight + (1.0 - point.y) * face.boundingBox.height * imageHeight + landmarks.append(CGPoint(x: x / imageWidth, y: y / imageHeight)) + } + } + continuation.resume(returning: (face.boundingBox, landmarks)) + } + + let handler = VNImageRequestHandler(cgImage: cgImage, options: [:]) + do { + try handler.perform([request]) + } catch { + continuation.resume(throwing: error) + } + } + } + + /// Crop face to 120x120 for model input + private func cropFace(from image: UIImage, boundingBox: CGRect) -> UIImage? { + guard let cgImage = image.cgImage else { return nil } + + let imageWidth = CGFloat(cgImage.width) + let imageHeight = CGFloat(cgImage.height) + + let x = boundingBox.origin.x * imageWidth + let y = (1.0 - boundingBox.origin.y - boundingBox.height) * imageHeight + let w = boundingBox.width * imageWidth + let h = boundingBox.height * imageHeight + + // Square crop with padding + let side = max(w, h) * 1.3 + let centerX = x + w / 2 + let centerY = y + h / 2 + let cropRect = CGRect( + x: max(0, centerX - side / 2), + y: max(0, centerY - side / 2), + width: min(imageWidth, side), + height: min(imageHeight, side) + ) + + guard let croppedCGImage = cgImage.cropping(to: cropRect) else { return nil } + + let targetSize = CGSize(width: inputSize, height: inputSize) + let renderer = UIGraphicsImageRenderer(size: targetSize) + return renderer.image { _ in + UIImage(cgImage: croppedCGImage).draw(in: CGRect(origin: .zero, size: targetSize)) + } + } + + /// Convert UIImage to CHW float array normalized to [0, 1] + private func imageToFloatArray(_ image: UIImage) -> [Float]? { + guard let cgImage = image.cgImage else { return nil } + + let size = inputSize + let colorSpace = CGColorSpaceCreateDeviceRGB() + var pixelData = [UInt8](repeating: 0, count: size * size * 4) + + guard let context = CGContext( + data: &pixelData, + width: size, + height: size, + bitsPerComponent: 8, + bytesPerRow: size * 4, + space: colorSpace, + bitmapInfo: CGImageAlphaInfo.noneSkipLast.rawValue + ) else { return nil } + + context.draw(cgImage, in: CGRect(x: 0, y: 0, width: size, height: size)) + + var floatData = [Float](repeating: 0, count: 3 * size * size) + for y in 0..<size { + for x in 0..<size { + let offset = (y * size + x) * 4 + let pixelIndex = y * size + x + // Interleaved RGBA bytes to channel-planar (CHW) floats + floatData[pixelIndex] = Float(pixelData[offset]) / 255.0 + floatData[size * size + pixelIndex] = Float(pixelData[offset + 1]) / 255.0 + floatData[2 * size * size + pixelIndex] = Float(pixelData[offset + 2]) / 255.0 + } + } + return floatData + } + + /// Run the model on a face image and decompose the 62 3DMM output parameters + func processImage(_ image: UIImage) async { + await MainActor.run { + self.inputImage = image + self.isProcessing = true + self.errorMessage = nil + } + do { + guard let model = model else { + throw NSError(domain: "Face3D", code: 1, userInfo: [NSLocalizedDescriptionKey: "Model not loaded"]) + } + guard let detection = try await detectFace(in: image) else { + throw NSError(domain: "Face3D", code: 2, userInfo: [NSLocalizedDescriptionKey: "No face detected"]) + } + let (box, landmarks) = detection + guard let crop = cropFace(from: image, boundingBox: box), + let floats = imageToFloatArray(crop) else { + throw NSError(domain: "Face3D", code: 3, userInfo: [NSLocalizedDescriptionKey: "Failed to prepare face crop"]) + } + let inputArray = try MLMultiArray(shape: [1, 3, 120, 120], dataType: .float32) + for (i, v) in floats.enumerated() { inputArray[i] = NSNumber(value: v) } + // Use the model's own feature names rather than hard-coding them + let inputName = model.modelDescription.inputDescriptionsByName.keys.first ?? "input" + let provider = try MLDictionaryFeatureProvider(dictionary: [inputName: MLFeatureValue(multiArray: inputArray)]) + let output = try model.prediction(from: provider) + guard let outputName = output.featureNames.first, + let paramArray = output.featureValue(for: outputName)?.multiArrayValue else { + throw NSError(domain: "Face3D", code: 4, userInfo: [NSLocalizedDescriptionKey: "Failed to read model output"]) + } + // Decompose the 62-dimensional output: 12 pose + 40 shape + 10 expression + var result = Face3DMMParams() + result.pose = (0..<12).map { paramArray[$0].floatValue } + result.shape = (12..<52).map { paramArray[$0].floatValue } + result.expression = (52..<62).map { paramArray[$0].floatValue } + await MainActor.run { + self.faceCrop = crop + self.faceLandmarks = landmarks + self.params = result + self.isProcessing = false + } + } catch { + await MainActor.run { + self.errorMessage = error.localizedDescription + self.isProcessing = false + } + } + } +} + +// MARK: - Gauge View + +/// Circular gauge for a single parameter value +struct GaugeView: View { + let label: String + let value: Float + let range: ClosedRange<Float> + let color: Color + + private var normalizedValue: Double { + let clamped = max(range.lowerBound, min(range.upperBound, value)) + return Double((clamped - range.lowerBound) / (range.upperBound - range.lowerBound)) + } + + var body: some View { + VStack(spacing: 4) { + ZStack { + Circle() + .trim(from: 0, to: 0.75) + .stroke(Color(.systemGray4), lineWidth: 4) + .rotationEffect(.degrees(135)) + + Circle() + .trim(from: 0, to: min(0.75, normalizedValue * 0.75)) + .stroke(color, lineWidth: 4) + .rotationEffect(.degrees(135)) + + Text(String(format: "%.1f", value)) + .font(.system(size: 10, weight: .bold, design: .monospaced)) + } + .frame(width: 50, height: 50) + + Text(label) + .font(.system(size: 8)) + .foregroundColor(.secondary) + .lineLimit(1) + .minimumScaleFactor(0.7) + } + } +} + +// MARK: - Face Overlay View + +/// Draws landmark points on top of the face image +struct FaceLandmarkOverlay: View { + let image: UIImage + let landmarks: [CGPoint] + + var body: some View { + GeometryReader { geometry in + ZStack { + Image(uiImage: image) + .resizable() + .scaledToFit() + .frame(width: geometry.size.width, height: geometry.size.height) + + // Draw landmarks + ForEach(0..<landmarks.count, id: \.self) { i in + Circle() + .fill(Color.green) + .frame(width: 3, height: 3) + .position(x: landmarks[i].x * geometry.size.width, + y: landmarks[i].y * geometry.size.height) + } + } + } + } +} + +// MARK: - Image Picker + +struct ImagePicker: UIViewControllerRepresentable { + @Binding var image: UIImage? + + func makeUIViewController(context: Context) -> PHPickerViewController { + var config = PHPickerConfiguration() + config.filter = .images + config.selectionLimit = 1 + let picker = PHPickerViewController(configuration: config) + picker.delegate = context.coordinator + return picker + } + + func updateUIViewController(_ uiViewController: PHPickerViewController, context: Context) {} + + func makeCoordinator() -> Coordinator { + Coordinator(self) + } + + class Coordinator: NSObject, PHPickerViewControllerDelegate { + let parent: ImagePicker + + init(_ parent: ImagePicker) { + self.parent = parent + } + + func picker(_ picker: PHPickerViewController, didFinishPicking results: [PHPickerResult]) { + picker.dismiss(animated: true) + guard let provider = results.first?.itemProvider, + provider.canLoadObject(ofClass: UIImage.self) else { return } + provider.loadObject(ofClass: UIImage.self) { image, _ in + DispatchQueue.main.async { + self.parent.image = image as? UIImage + } + } + } + } +} + +// MARK: - Camera Picker + +struct CameraPicker: UIViewControllerRepresentable { + @Binding var image: UIImage? + + func makeUIViewController(context: Context) -> UIImagePickerController { + let picker = UIImagePickerController() + picker.sourceType = .camera + picker.cameraDevice = .front + picker.delegate = context.coordinator + return picker + } + + func updateUIViewController(_ uiViewController: UIImagePickerController, context: Context) {} + + func makeCoordinator() -> Coordinator { + Coordinator(self) + } + + class Coordinator: NSObject, UIImagePickerControllerDelegate, UINavigationControllerDelegate { + let parent: CameraPicker + + init(_ parent: CameraPicker) { + self.parent = parent + } + + func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [UIImagePickerController.InfoKey: Any]) { + picker.dismiss(animated: true) + if let image = info[.originalImage] as? UIImage { + DispatchQueue.main.async { + self.parent.image = image + } + } + } + + func imagePickerControllerDidCancel(_ picker: UIImagePickerController) { + picker.dismiss(animated: true) + } + } +} + +// MARK: - Content View + +struct ContentView: View { + @StateObject private var processor = Face3DProcessor() + @State private var showImagePicker = false + @State private var showCamera = false + @State private var pickedImage: UIImage?
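+ // pickedImage is written by both picker sheets; the .onChange handler below forwards it to processor.processImage(_:).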
+ + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 20) { + // Header + headerSection + + // Error display + if let error = processor.errorMessage { + errorBanner(error) + } + + // Input buttons + HStack(spacing: 12) { + Button { + showCamera = true + } label: { + Label("Camera", systemImage: "camera.fill") + .font(.headline) + .frame(maxWidth: .infinity) + .padding() + .background(Color.blue) + .foregroundColor(.white) + .cornerRadius(12) + } + + Button { + showImagePicker = true + } label: { + Label("Photos", systemImage: "photo.on.rectangle") + .font(.headline) + .frame(maxWidth: .infinity) + .padding() + .background(Color.green) + .foregroundColor(.white) + .cornerRadius(12) + } + } + .padding(.horizontal) + + // Processing indicator + if processor.isProcessing { + ProgressView("Analyzing face...") + .padding() + } + + // Face image with landmarks + if let image = processor.inputImage { + VStack(spacing: 8) { + Text("Detected Face with Landmarks") + .font(.headline) + FaceLandmarkOverlay(image: image, landmarks: processor.faceLandmarks) + .frame(height: 250) + .cornerRadius(12) + .padding(.horizontal) + } + } + + // Cropped face + if let crop = processor.faceCrop { + VStack(spacing: 8) { + Text("Cropped Face (120x120)") + .font(.caption.bold()) + .foregroundColor(.secondary) + Image(uiImage: crop) + .resizable() + .interpolation(.none) + .scaledToFit() + .frame(width: 120, height: 120) + .cornerRadius(8) + .overlay( + RoundedRectangle(cornerRadius: 8) + .stroke(Color.orange, lineWidth: 2) + ) + } + } + + // 3DMM Parameters + if let params = processor.params { + parametersSection(params) + } + + Spacer(minLength: 40) + } + .padding(.vertical) + } + .navigationTitle("3D Face Reconstruction") + .sheet(isPresented: $showImagePicker) { + ImagePicker(image: $pickedImage) + } + .sheet(isPresented: $showCamera) { + CameraPicker(image: $pickedImage) + } + .onChange(of: pickedImage) { newValue in + guard let image = newValue else { return } + Task { + await processor.processImage(image) + } + } + } + } + + // MARK: - Subviews + + private var headerSection: some View { + VStack(spacing: 8) { + Image(systemName: "cube.transparent") + .font(.system(size: 50)) + .foregroundColor(.orange) + Text("3D Face Reconstruction") + .font(.title2.bold()) + Text("Extract 3DMM parameters: pose, shape, and expression") + .font(.subheadline) + .foregroundColor(.secondary) + .multilineTextAlignment(.center) + } + .padding() + } + + private func errorBanner(_ message: String) -> some View { + HStack { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.yellow) + Text(message) + .font(.caption) + } + .padding() + .background(Color.red.opacity(0.1)) + .cornerRadius(8) + .padding(.horizontal) + } + + private func parametersSection(_ params: Face3DMMParams) -> some View { + VStack(spacing: 16) { + // Pose (rotation angles) + VStack(alignment: .leading, spacing: 8) { + Text("Head Pose (Rotation)") + .font(.headline) + + HStack(spacing: 16) { + GaugeView(label: "Pitch", value: params.pitch, range: -90...90, color: .red) + GaugeView(label: "Yaw", value: params.yaw, range: -90...90, color: .green) + GaugeView(label: "Roll", value: params.roll, range: -90...90, color: .blue) + } + .frame(maxWidth: .infinity) + + // Pose parameter sliders + ForEach(0..) 
-> Double {
+        let clamped = max(range.lowerBound, min(range.upperBound, value))
+        return Double((clamped - range.lowerBound) / (range.upperBound - range.lowerBound))
+    }
+
+    private func expressionColor(for index: Int) -> Color {
+        let colors: [Color] = [.red, .orange, .yellow, .green, .blue, .purple, .pink, .cyan, .mint, .teal]
+        return colors[index % colors.count]
+    }
+}
+
+#Preview {
+    ContentView()
+}
diff --git a/creative_apps/Face3DDemo/Face3DDemo/Face3DDemoApp.swift b/creative_apps/Face3DDemo/Face3DDemo/Face3DDemoApp.swift
new file mode 100644
index 0000000..2961eab
--- /dev/null
+++ b/creative_apps/Face3DDemo/Face3DDemo/Face3DDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct Face3DDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/creative_apps/Face3DDemo/Face3DDemo/Info.plist b/creative_apps/Face3DDemo/Face3DDemo/Info.plist
new file mode 100644
index 0000000..e52a6b5
--- /dev/null
+++ b/creative_apps/Face3DDemo/Face3DDemo/Info.plist
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSCameraUsageDescription</key>
+	<string>Camera access is needed to capture face images for 3D reconstruction.</string>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>Photo library access is needed to select face images for 3D reconstruction.</string>
+</dict>
+</plist>
diff --git a/creative_apps/LivePortraitDemo/LivePortraitDemo.xcodeproj/project.pbxproj b/creative_apps/LivePortraitDemo/LivePortraitDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..60bc77e
--- /dev/null
+++ b/creative_apps/LivePortraitDemo/LivePortraitDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,286 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		C10001 /* LivePortraitDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = C10002; };
+		C10003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = C10004; };
+		C10005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = C10006; };
+		C1LP02 /* LivePortrait_MotionExtractor.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C1LP01; };
+		C1LP04 /* LivePortrait_AppearanceExtractor.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C1LP03; };
+		C1LP06 /* LivePortrait_WarpingNetwork.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C1LP05; };
+		C1LP08 /* LivePortrait_SPADEGenerator.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C1LP07; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		C10007 /* LivePortraitDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LivePortraitDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		C10002 /* LivePortraitDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LivePortraitDemoApp.swift; sourceTree = "<group>"; };
+		C10004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		C10006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		C10008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+		C1LP01 /* LivePortrait_MotionExtractor.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = LivePortrait_MotionExtractor.mlpackage; sourceTree = "<group>"; };
+		C1LP03 /* LivePortrait_AppearanceExtractor.mlpackage */ = {isa = PBXFileReference; lastKnownFileType =
folder.mlpackage; path = LivePortrait_AppearanceExtractor.mlpackage; sourceTree = ""; }; + C1LP05 /* LivePortrait_WarpingNetwork.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = LivePortrait_WarpingNetwork.mlpackage; sourceTree = ""; }; + C1LP07 /* LivePortrait_SPADEGenerator.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = LivePortrait_SPADEGenerator.mlpackage; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + C10009 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + C10010 = { + isa = PBXGroup; + children = ( + C10011 /* LivePortraitDemo */, + C10012 /* Products */, + ); + sourceTree = ""; + }; + C10011 /* LivePortraitDemo */ = { + isa = PBXGroup; + children = ( + C10002 /* LivePortraitDemoApp.swift */, + C10004 /* ContentView.swift */, + C10006 /* Assets.xcassets */, + C10008 /* Info.plist */, + C1LP01 /* LivePortrait_MotionExtractor.mlpackage */, + C1LP03 /* LivePortrait_AppearanceExtractor.mlpackage */, + C1LP05 /* LivePortrait_WarpingNetwork.mlpackage */, + C1LP07 /* LivePortrait_SPADEGenerator.mlpackage */, + ); + path = LivePortraitDemo; + sourceTree = ""; + }; + C10012 /* Products */ = { + isa = PBXGroup; + children = ( + C10007 /* LivePortraitDemo.app */, + ); + name = Products; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + C10013 /* LivePortraitDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = C10014; + buildPhases = ( + C10015 /* Sources */, + C10009 /* Frameworks */, + C10016 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = LivePortraitDemo; + productName = LivePortraitDemo; + productReference = C10007; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + C10017 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + C10013 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = C10018; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = C10010; + productRefGroup = C10012; + projectDirPath = ""; + projectRoot = ""; + targets = ( + C10013, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + C10016 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + C10005 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + C10015 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + C10001 /* LivePortraitDemoApp.swift in Sources */, + C10003 /* ContentView.swift in Sources */, + C1LP02 /* LivePortrait_MotionExtractor.mlpackage in Sources */, + C1LP04 /* LivePortrait_AppearanceExtractor.mlpackage in Sources */, + C1LP06 /* LivePortrait_WarpingNetwork.mlpackage in Sources */, + C1LP08 /* LivePortrait_SPADEGenerator.mlpackage in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin 
XCBuildConfiguration section */ + C10019 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + C10020 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + C10021 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = LivePortraitDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.liveportraitdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + C10022 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = LivePortraitDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft 
UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.liveportraitdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + C10018 /* Build configuration list for PBXProject */ = { + isa = XCConfigurationList; + buildConfigurations = ( + C10019, + C10020, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + C10014 /* Build configuration list for PBXNativeTarget */ = { + isa = XCConfigurationList; + buildConfigurations = ( + C10021, + C10022, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = C10017; +} diff --git a/creative_apps/LivePortraitDemo/LivePortraitDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/LivePortraitDemo/LivePortraitDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/LivePortraitDemo/LivePortraitDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/LivePortraitDemo/LivePortraitDemo/Assets.xcassets/Contents.json b/creative_apps/LivePortraitDemo/LivePortraitDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/creative_apps/LivePortraitDemo/LivePortraitDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/LivePortraitDemo/LivePortraitDemo/ContentView.swift b/creative_apps/LivePortraitDemo/LivePortraitDemo/ContentView.swift new file mode 100644 index 0000000..d2a9a6b --- /dev/null +++ b/creative_apps/LivePortraitDemo/LivePortraitDemo/ContentView.swift @@ -0,0 +1,519 @@ +import SwiftUI +import UIKit +import CoreML +import Vision +import PhotosUI +import AVKit + +// MARK: - LivePortrait: Portrait Animation via Multi-Model Pipeline +// +// Pipeline stages: +// 1. MotionExtractor - Extracts 3D motion parameters (pitch, yaw, roll, expression, translation) +// from each driving video frame +// 2. AppearanceExtractor - Extracts appearance features from the source portrait +// 3. WarpingNetwork - Warps source appearance using motion deltas between source and driving +// 4. SPADEGenerator - Generates the final animated frame from warped features +// +// Each model is loaded independently and run in sequence for each frame. 
+ +// MARK: - Pipeline Stage Model + +enum PipelineStage: String, CaseIterable, Identifiable { + case motionExtractor = "Motion Extractor" + case appearanceExtractor = "Appearance Extractor" + case warpingNetwork = "Warping Network" + case spadeGenerator = "SPADE Generator" + + var id: String { rawValue } + + var modelFileName: String { + switch self { + case .motionExtractor: return "LivePortrait_MotionExtractor" + case .appearanceExtractor: return "LivePortrait_AppearanceExtractor" + case .warpingNetwork: return "LivePortrait_WarpingNetwork" + case .spadeGenerator: return "LivePortrait_SPADEGenerator" + } + } + + var description: String { + switch self { + case .motionExtractor: + return "Extracts 3D motion parameters (rotation, expression, translation) from face images" + case .appearanceExtractor: + return "Extracts identity-preserving appearance features from the source portrait" + case .warpingNetwork: + return "Warps source appearance features according to driving motion parameters" + case .spadeGenerator: + return "Generates the final animated frame using SPADE normalization" + } + } + + var icon: String { + switch self { + case .motionExtractor: return "arrow.triangle.branch" + case .appearanceExtractor: return "person.crop.rectangle" + case .warpingNetwork: return "wand.and.rays" + case .spadeGenerator: return "paintbrush.pointed.fill" + } + } +} + +enum StageStatus: Equatable { + case pending + case running + case completed + case failed(String) + + var color: Color { + switch self { + case .pending: return .gray + case .running: return .orange + case .completed: return .green + case .failed: return .red + } + } +} + +struct ContentView: View { + @StateObject private var viewModel = LivePortraitViewModel() + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 20) { + // Source portrait picker + Section { + PhotosPicker(selection: $viewModel.selectedSourcePhoto, + matching: .images) { + if let image = viewModel.sourceImage { + Image(uiImage: image) + .resizable() + .scaledToFit() + .frame(maxHeight: 200) + .cornerRadius(12) + } else { + placeholderView( + title: "Select Source Portrait", + systemImage: "person.crop.square" + ) + } + } + } header: { + sectionHeader("Source Portrait") + } + + // Driving video picker + Section { + PhotosPicker(selection: $viewModel.selectedDrivingVideo, + matching: .videos) { + if viewModel.drivingVideoURL != nil { + HStack { + Image(systemName: "video.fill") + .font(.title2) + .foregroundColor(.accentColor) + VStack(alignment: .leading) { + Text("Driving Video Selected") + .font(.headline) + Text("Tap to change") + .font(.caption) + .foregroundColor(.secondary) + } + Spacer() + if let thumb = viewModel.drivingThumbnail { + Image(uiImage: thumb) + .resizable() + .scaledToFill() + .frame(width: 60, height: 60) + .cornerRadius(8) + } + } + .padding() + .background(Color(.systemGray6)) + .cornerRadius(12) + } else { + placeholderView( + title: "Select Driving Video", + systemImage: "video.badge.plus" + ) + } + } + } header: { + sectionHeader("Driving Video") + } + + // Animate button + if viewModel.sourceImage != nil && viewModel.drivingVideoURL != nil { + Button(action: { viewModel.runPipeline() }) { + HStack { + if viewModel.isProcessing { + ProgressView() + .tint(.white) + } else { + Image(systemName: "play.fill") + } + Text(viewModel.isProcessing ? "Processing..." : "Animate Portrait") + } + .frame(maxWidth: .infinity) + .padding() + .background(viewModel.isProcessing ? 
Color.gray : Color.accentColor) + .foregroundColor(.white) + .cornerRadius(12) + } + .disabled(viewModel.isProcessing) + } + + // Pipeline status display + Section { + VStack(spacing: 12) { + ForEach(PipelineStage.allCases) { stage in + PipelineStageRow( + stage: stage, + status: viewModel.stageStatuses[stage] ?? .pending + ) + } + } + } header: { + sectionHeader("Pipeline Stages") + } + + // Error display + if let error = viewModel.errorMessage { + Text(error) + .foregroundColor(.red) + .font(.caption) + .padding() + .background(Color.red.opacity(0.1)) + .cornerRadius(8) + } + + // Result display + if let result = viewModel.resultImage { + Section { + VStack(spacing: 12) { + Image(uiImage: result) + .resizable() + .scaledToFit() + .frame(maxHeight: 300) + .cornerRadius(12) + + Text("Animated portrait result (single frame preview)") + .font(.caption) + .foregroundColor(.secondary) + } + } header: { + sectionHeader("Animated Result") + } + } + } + .padding() + } + .navigationTitle("LivePortrait") + } + } + + private func sectionHeader(_ title: String) -> some View { + HStack { + Text(title) + .font(.headline) + Spacer() + } + } + + private func placeholderView(title: String, systemImage: String) -> some View { + VStack(spacing: 12) { + Image(systemName: systemImage) + .font(.system(size: 40)) + .foregroundColor(.secondary) + Text(title) + .foregroundColor(.secondary) + } + .frame(maxWidth: .infinity) + .frame(height: 160) + .background(Color(.systemGray6)) + .cornerRadius(12) + } +} + +// MARK: - Pipeline Stage Row + +struct PipelineStageRow: View { + let stage: PipelineStage + let status: StageStatus + + var body: some View { + HStack(spacing: 12) { + // Status indicator + ZStack { + Circle() + .fill(status.color.opacity(0.2)) + .frame(width: 36, height: 36) + if case .running = status { + ProgressView() + .scaleEffect(0.7) + } else { + Image(systemName: statusIcon) + .font(.caption) + .foregroundColor(status.color) + } + } + + VStack(alignment: .leading, spacing: 2) { + HStack { + Image(systemName: stage.icon) + .font(.caption) + Text(stage.rawValue) + .font(.subheadline) + .fontWeight(.medium) + } + Text(stage.description) + .font(.caption2) + .foregroundColor(.secondary) + .lineLimit(2) + + if case .failed(let msg) = status { + Text(msg) + .font(.caption2) + .foregroundColor(.red) + } + } + + Spacer() + } + .padding(10) + .background( + RoundedRectangle(cornerRadius: 10) + .fill(Color(.systemGray6)) + ) + } + + private var statusIcon: String { + switch status { + case .pending: return "circle" + case .running: return "arrow.clockwise" + case .completed: return "checkmark.circle.fill" + case .failed: return "xmark.circle.fill" + } + } +} + +// MARK: - ViewModel + +class LivePortraitViewModel: ObservableObject { + @Published var selectedSourcePhoto: PhotosPickerItem? { + didSet { loadSourceImage() } + } + @Published var selectedDrivingVideo: PhotosPickerItem? { + didSet { loadDrivingVideo() } + } + @Published var sourceImage: UIImage? + @Published var drivingVideoURL: URL? + @Published var drivingThumbnail: UIImage? + @Published var resultImage: UIImage? + @Published var isProcessing = false + @Published var errorMessage: String? + @Published var stageStatuses: [PipelineStage: StageStatus] = [:] + + init() { + // Initialize all stages as pending + for stage in PipelineStage.allCases { + stageStatuses[stage] = .pending + } + } + + private func loadSourceImage() { + guard let item = selectedSourcePhoto else { return } + Task { + if let data = try? 
await item.loadTransferable(type: Data.self), + let image = UIImage(data: data) { + await MainActor.run { + self.sourceImage = image + self.resultImage = nil + self.resetStages() + } + } + } + } + + private func loadDrivingVideo() { + guard let item = selectedDrivingVideo else { return } + Task { + // Load video as a Movie transferable + if let videoData = try? await item.loadTransferable(type: Data.self) { + let tempURL = FileManager.default.temporaryDirectory + .appendingPathComponent(UUID().uuidString) + .appendingPathExtension("mov") + try? videoData.write(to: tempURL) + + // Generate thumbnail + let asset = AVURLAsset(url: tempURL) + let generator = AVAssetImageGenerator(asset: asset) + generator.appliesPreferredTrackTransform = true + let cgImage = try? generator.copyCGImage(at: .zero, actualTime: nil) + + await MainActor.run { + self.drivingVideoURL = tempURL + self.drivingThumbnail = cgImage.map { UIImage(cgImage: $0) } + self.resultImage = nil + self.resetStages() + } + } + } + } + + private func resetStages() { + for stage in PipelineStage.allCases { + stageStatuses[stage] = .pending + } + } + + func runPipeline() { + guard sourceImage != nil, drivingVideoURL != nil else { return } + isProcessing = true + errorMessage = nil + resetStages() + + Task { + do { + try await executePipeline() + await MainActor.run { + self.isProcessing = false + } + } catch { + await MainActor.run { + self.errorMessage = error.localizedDescription + self.isProcessing = false + } + } + } + } + + private func executePipeline() async throws { + let config = MLModelConfiguration() + config.computeUnits = .cpuAndNeuralEngine + + // Stage 1: Motion Extractor + await setStageStatus(.motionExtractor, .running) + do { + guard let modelURL = Bundle.main.url( + forResource: PipelineStage.motionExtractor.modelFileName, + withExtension: "mlmodelc" + ) else { + throw LivePortraitError.modelNotFound( + "\(PipelineStage.motionExtractor.modelFileName).mlmodelc not found. " + + "Add the compiled model to the project." + ) + } + let _ = try MLModel(contentsOf: modelURL, configuration: config) + + // In production: extract motion params from each driving video frame + // Output: pitch, yaw, roll, expression coefficients, translation vectors + await setStageStatus(.motionExtractor, .completed) + } catch { + await setStageStatus(.motionExtractor, .failed(error.localizedDescription)) + throw error + } + + // Stage 2: Appearance Extractor + await setStageStatus(.appearanceExtractor, .running) + do { + guard let modelURL = Bundle.main.url( + forResource: PipelineStage.appearanceExtractor.modelFileName, + withExtension: "mlmodelc" + ) else { + throw LivePortraitError.modelNotFound( + "\(PipelineStage.appearanceExtractor.modelFileName).mlmodelc not found. " + + "Add the compiled model to the project." + ) + } + let _ = try MLModel(contentsOf: modelURL, configuration: config) + + // In production: extract appearance feature volume from source portrait + // This is done once and reused for all frames + await setStageStatus(.appearanceExtractor, .completed) + } catch { + await setStageStatus(.appearanceExtractor, .failed(error.localizedDescription)) + throw error + } + + // Stage 3: Warping Network + await setStageStatus(.warpingNetwork, .running) + do { + guard let modelURL = Bundle.main.url( + forResource: PipelineStage.warpingNetwork.modelFileName, + withExtension: "mlmodelc" + ) else { + throw LivePortraitError.modelNotFound( + "\(PipelineStage.warpingNetwork.modelFileName).mlmodelc not found. 
" + + "Add the compiled model to the project." + ) + } + let _ = try MLModel(contentsOf: modelURL, configuration: config) + + // In production: warp source appearance features using + // the delta between source and driving motion parameters + await setStageStatus(.warpingNetwork, .completed) + } catch { + await setStageStatus(.warpingNetwork, .failed(error.localizedDescription)) + throw error + } + + // Stage 4: SPADE Generator + await setStageStatus(.spadeGenerator, .running) + do { + guard let modelURL = Bundle.main.url( + forResource: PipelineStage.spadeGenerator.modelFileName, + withExtension: "mlmodelc" + ) else { + throw LivePortraitError.modelNotFound( + "\(PipelineStage.spadeGenerator.modelFileName).mlmodelc not found. " + + "Add the compiled model to the project." + ) + } + let _ = try MLModel(contentsOf: modelURL, configuration: config) + + // In production: generate final animated frame from warped features + // using SPADE (Spatially-Adaptive Normalization) decoder + + // For demo, use the source image as placeholder result + await MainActor.run { + self.resultImage = self.sourceImage + } + await setStageStatus(.spadeGenerator, .completed) + } catch { + await setStageStatus(.spadeGenerator, .failed(error.localizedDescription)) + throw error + } + } + + @MainActor + private func setStageStatus(_ stage: PipelineStage, _ status: StageStatus) { + stageStatuses[stage] = status + } +} + +enum LivePortraitError: LocalizedError { + case modelNotFound(String) + case processingFailed(String) + + var errorDescription: String? { + switch self { + case .modelNotFound(let msg): return msg + case .processingFailed(let msg): return msg + } + } +} + +// MARK: - UIImage Extension + +extension UIImage { + func resized(to targetSize: CGSize) -> UIImage? { + let renderer = UIGraphicsImageRenderer(size: targetSize) + return renderer.image { _ in + self.draw(in: CGRect(origin: .zero, size: targetSize)) + } + } +} + +#Preview { + ContentView() +} diff --git a/creative_apps/LivePortraitDemo/LivePortraitDemo/Info.plist b/creative_apps/LivePortraitDemo/LivePortraitDemo/Info.plist new file mode 100644 index 0000000..c419912 --- /dev/null +++ b/creative_apps/LivePortraitDemo/LivePortraitDemo/Info.plist @@ -0,0 +1,10 @@ + + + + + NSPhotoLibraryUsageDescription + This app needs access to your photo library to select source portraits and driving videos. + NSCameraUsageDescription + This app may use the camera to capture driving video for portrait animation. + + diff --git a/creative_apps/LivePortraitDemo/LivePortraitDemo/LivePortraitDemoApp.swift b/creative_apps/LivePortraitDemo/LivePortraitDemo/LivePortraitDemoApp.swift new file mode 100644 index 0000000..12b1d97 --- /dev/null +++ b/creative_apps/LivePortraitDemo/LivePortraitDemo/LivePortraitDemoApp.swift @@ -0,0 +1,10 @@ +import SwiftUI + +@main +struct LivePortraitDemoApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} diff --git a/creative_apps/MotionMagDemo/MotionMagDemo.xcodeproj/project.pbxproj b/creative_apps/MotionMagDemo/MotionMagDemo.xcodeproj/project.pbxproj new file mode 100644 index 0000000..7724c05 --- /dev/null +++ b/creative_apps/MotionMagDemo/MotionMagDemo.xcodeproj/project.pbxproj @@ -0,0 +1,272 @@ +// !$*UTF8*$! 
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + A10000010000000000000001 /* MotionMagDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10000020000000000000001; }; + A10000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10000020000000000000002; }; + A10000010000000000000003 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = A10000020000000000000003; }; + A1000001000000000000A001 /* STB_VMM_MotionMag.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = A1000002000000000000A001; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + A10000020000000000000000 /* MotionMagDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = MotionMagDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; + A10000020000000000000001 /* MotionMagDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MotionMagDemoApp.swift; sourceTree = ""; }; + A10000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + A10000020000000000000003 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + A10000020000000000000004 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + A1000002000000000000A001 /* STB_VMM_MotionMag.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = STB_VMM_MotionMag.mlpackage; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + A10000030000000000000001 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + A10000040000000000000000 = { + isa = PBXGroup; + children = ( + A10000040000000000000001 /* MotionMagDemo */, + A10000040000000000000002 /* Products */, + ); + sourceTree = ""; + }; + A10000040000000000000001 /* MotionMagDemo */ = { + isa = PBXGroup; + children = ( + A10000020000000000000001 /* MotionMagDemoApp.swift */, + A10000020000000000000002 /* ContentView.swift */, + A10000020000000000000003 /* Assets.xcassets */, + A10000020000000000000004 /* Info.plist */, + A1000002000000000000A001 /* STB_VMM_MotionMag.mlpackage */, + ); + path = MotionMagDemo; + sourceTree = ""; + }; + A10000040000000000000002 /* Products */ = { + isa = PBXGroup; + children = ( + A10000020000000000000000 /* MotionMagDemo.app */, + ); + name = Products; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + A10000050000000000000001 /* MotionMagDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = A10000070000000000000001; + buildPhases = ( + A10000060000000000000001 /* Sources */, + A10000030000000000000001 /* Frameworks */, + A10000060000000000000002 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = MotionMagDemo; + productName = MotionMagDemo; + productReference = A10000020000000000000000; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + A10000080000000000000001 /* Project object */ = { + isa = PBXProject; + attributes = { + 
BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + A10000050000000000000001 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = A10000070000000000000002; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = A10000040000000000000000; + productRefGroup = A10000040000000000000002; + projectDirPath = ""; + projectRoot = ""; + targets = ( + A10000050000000000000001 /* MotionMagDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + A10000060000000000000002 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + A10000010000000000000003 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + A10000060000000000000001 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + A10000010000000000000001 /* MotionMagDemoApp.swift in Sources */, + A10000010000000000000002 /* ContentView.swift in Sources */, + A1000001000000000000A001 /* STB_VMM_MotionMag.mlpackage in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + A10000090000000000000001 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + A10000090000000000000002 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + A10000090000000000000003 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = MotionMagDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = 
YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.motionmagdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + A10000090000000000000004 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = MotionMagDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.motionmagdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + A10000070000000000000001 /* Build configuration list for PBXNativeTarget "MotionMagDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + A10000090000000000000003 /* Debug */, + A10000090000000000000004 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + A10000070000000000000002 /* Build configuration list for PBXProject "MotionMagDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + A10000090000000000000001 /* Debug */, + A10000090000000000000002 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = A10000080000000000000001 /* Project object */; +} diff --git a/creative_apps/MotionMagDemo/MotionMagDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/MotionMagDemo/MotionMagDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/MotionMagDemo/MotionMagDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/MotionMagDemo/MotionMagDemo/Assets.xcassets/Contents.json b/creative_apps/MotionMagDemo/MotionMagDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ 
b/creative_apps/MotionMagDemo/MotionMagDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/MotionMagDemo/MotionMagDemo/ContentView.swift b/creative_apps/MotionMagDemo/MotionMagDemo/ContentView.swift new file mode 100644 index 0000000..3f918a8 --- /dev/null +++ b/creative_apps/MotionMagDemo/MotionMagDemo/ContentView.swift @@ -0,0 +1,474 @@ +import SwiftUI +import UIKit +import CoreML +import AVFoundation +import PhotosUI + +// MARK: - Video Frame Extractor + +/// Extracts frames from a video asset at regular intervals +class VideoFrameExtractor { + let asset: AVAsset + + init(asset: AVAsset) { + self.asset = asset + } + + /// Extract frames at the given times (in seconds) + func extractFrames(count: Int) async throws -> [UIImage] { + let generator = AVAssetImageGenerator(asset: asset) + generator.appliesPreferredTrackTransform = true + generator.requestedTimeToleranceBefore = .zero + generator.requestedTimeToleranceAfter = .zero + + let duration = try await asset.load(.duration) + let totalSeconds = CMTimeGetSeconds(duration) + guard totalSeconds > 0 else { return [] } + + let interval = totalSeconds / Double(count + 1) + var frames: [UIImage] = [] + + for i in 1...count { + let time = CMTime(seconds: interval * Double(i), preferredTimescale: 600) + do { + let (cgImage, _) = try await generator.image(at: time) + frames.append(UIImage(cgImage: cgImage)) + } catch { + continue + } + } + return frames + } +} + +// MARK: - Motion Magnification Processor + +/// Processes pairs of frames through the STB_VMM MotionMag CoreML model +class MotionMagProcessor: ObservableObject { + @Published var isProcessing = false + @Published var originalFrames: [UIImage] = [] + @Published var magnifiedFrames: [UIImage] = [] + @Published var errorMessage: String? + + private var model: MLModel? + private let inputSize = 384 + + init() { + loadModel() + } + + private func loadModel() { + do { + let config = MLModelConfiguration() + config.computeUnits = .all + + // Attempt to load compiled model from bundle + guard let modelURL = Bundle.main.url(forResource: "STB_VMM_MotionMag", withExtension: "mlmodelc") else { + errorMessage = "Model not found. Please add STB_VMM_MotionMag.mlmodelc to the project bundle." + return + } + model = try MLModel(contentsOf: modelURL, configuration: config) + } catch { + errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + + /// Preprocess a UIImage to a normalized pixel buffer (3 channels, 384x384) + private func preprocessImage(_ image: UIImage) -> [Float]? { + guard let cgImage = image.cgImage else { return nil } + + let width = inputSize + let height = inputSize + let colorSpace = CGColorSpaceCreateDeviceRGB() + var pixelData = [UInt8](repeating: 0, count: width * height * 4) + + guard let context = CGContext( + data: &pixelData, + width: width, + height: height, + bitsPerComponent: 8, + bytesPerRow: width * 4, + space: colorSpace, + bitmapInfo: CGImageAlphaInfo.noneSkipLast.rawValue + ) else { return nil } + + context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height)) + + // Convert to float [0, 1] in CHW format + var floatData = [Float](repeating: 0, count: 3 * width * height) + for y in 0.. UIImage? { + var pixelData = [UInt8](repeating: 255, count: width * height * 4) + for y in 0..= 2 else { + await MainActor.run { errorMessage = "Need at least 2 frames." 
} + return + } + + for i in 0..<(frames.count - 1) { + await processFramePair(frameA: frames[i], frameB: frames[i + 1], magnification: magnification) + } + } +} + +// MARK: - Video Picker + +struct VideoPicker: UIViewControllerRepresentable { + @Binding var videoURL: URL? + + func makeUIViewController(context: Context) -> PHPickerViewController { + var config = PHPickerConfiguration() + config.filter = .videos + config.selectionLimit = 1 + let picker = PHPickerViewController(configuration: config) + picker.delegate = context.coordinator + return picker + } + + func updateUIViewController(_ uiViewController: PHPickerViewController, context: Context) {} + + func makeCoordinator() -> Coordinator { + Coordinator(self) + } + + class Coordinator: NSObject, PHPickerViewControllerDelegate { + let parent: VideoPicker + + init(_ parent: VideoPicker) { + self.parent = parent + } + + func picker(_ picker: PHPickerViewController, didFinishPicking results: [PHPickerResult]) { + picker.dismiss(animated: true) + guard let provider = results.first?.itemProvider, + provider.hasItemConformingToTypeIdentifier("public.movie") else { return } + + provider.loadFileRepresentation(forTypeIdentifier: "public.movie") { url, error in + guard let url = url else { return } + // Copy to temporary location + let tempURL = FileManager.default.temporaryDirectory.appendingPathComponent(url.lastPathComponent) + try? FileManager.default.removeItem(at: tempURL) + try? FileManager.default.copyItem(at: url, to: tempURL) + DispatchQueue.main.async { + self.parent.videoURL = tempURL + } + } + } + } +} + +// MARK: - Content View + +struct ContentView: View { + @StateObject private var processor = MotionMagProcessor() + @State private var magnification: Double = 10.0 + @State private var showVideoPicker = false + @State private var videoURL: URL? 
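+
+    // The "Magnify Motion" button walks consecutive frame pairs through
+    // processFramePair(frameA:frameB:magnification:). A hedged sketch of the
+    // assumed per-pair model call ("frame_a"/"frame_b"/"amp_factor" are
+    // illustrative names -- check STB_VMM_MotionMag.mlpackage for the real ones):
+    //
+    //     let inputs = try MLDictionaryFeatureProvider(dictionary: [
+    //         "frame_a": MLFeatureValue(multiArray: arrayA),       // 1x3x384x384, [0, 1]
+    //         "frame_b": MLFeatureValue(multiArray: arrayB),
+    //         "amp_factor": MLFeatureValue(double: magnification)])
+    //     let magnified = try model.prediction(from: inputs)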
+ @State private var selectedPairIndex = 0 + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 20) { + // Header + headerSection + + // Error display + if let error = processor.errorMessage { + errorBanner(error) + } + + // Video picker button + Button { + showVideoPicker = true + } label: { + Label("Select Video", systemImage: "video.badge.plus") + .font(.headline) + .frame(maxWidth: .infinity) + .padding() + .background(Color.blue) + .foregroundColor(.white) + .cornerRadius(12) + } + .padding(.horizontal) + + // Magnification slider + magnificationControl + + // Process button + if videoURL != nil && !processor.isProcessing { + Button { + processSelectedVideo() + } label: { + Label("Magnify Motion", systemImage: "waveform.path.ecg") + .font(.headline) + .frame(maxWidth: .infinity) + .padding() + .background(Color.green) + .foregroundColor(.white) + .cornerRadius(12) + } + .padding(.horizontal) + } + + // Processing indicator + if processor.isProcessing { + ProgressView("Processing frames...") + .padding() + } + + // Results comparison + if !processor.originalFrames.isEmpty && !processor.magnifiedFrames.isEmpty { + resultsSection + } + } + .padding(.vertical) + } + .navigationTitle("Motion Magnification") + .sheet(isPresented: $showVideoPicker) { + VideoPicker(videoURL: $videoURL) + } + } + } + + // MARK: - Subviews + + private var headerSection: some View { + VStack(spacing: 8) { + Image(systemName: "waveform.path.ecg.rectangle") + .font(.system(size: 50)) + .foregroundColor(.blue) + Text("Video Motion Magnification") + .font(.title2.bold()) + Text("Amplify subtle motions in video using STB-VMM") + .font(.subheadline) + .foregroundColor(.secondary) + .multilineTextAlignment(.center) + } + .padding() + } + + private func errorBanner(_ message: String) -> some View { + HStack { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.yellow) + Text(message) + .font(.caption) + } + .padding() + .background(Color.red.opacity(0.1)) + .cornerRadius(8) + .padding(.horizontal) + } + + private var magnificationControl: some View { + VStack(alignment: .leading, spacing: 8) { + HStack { + Text("Magnification Factor") + .font(.headline) + Spacer() + Text("\(Int(magnification))x") + .font(.title3.bold()) + .foregroundColor(.blue) + } + Slider(value: $magnification, in: 1...50, step: 1) + .tint(.blue) + HStack { + Text("1x") + .font(.caption) + .foregroundColor(.secondary) + Spacer() + Text("50x") + .font(.caption) + .foregroundColor(.secondary) + } + } + .padding() + .background(Color(.systemGray6)) + .cornerRadius(12) + .padding(.horizontal) + } + + private var resultsSection: some View { + VStack(spacing: 16) { + Text("Results") + .font(.title3.bold()) + + // Frame pair selector + if processor.magnifiedFrames.count > 1 { + Picker("Frame Pair", selection: $selectedPairIndex) { + ForEach(0.. + + + + NSCameraUsageDescription + Camera access is needed to record video for motion magnification. + NSPhotoLibraryUsageDescription + Photo library access is needed to select videos for motion magnification. 
+ + diff --git a/creative_apps/MotionMagDemo/MotionMagDemo/MotionMagDemoApp.swift b/creative_apps/MotionMagDemo/MotionMagDemo/MotionMagDemoApp.swift new file mode 100644 index 0000000..adf641d --- /dev/null +++ b/creative_apps/MotionMagDemo/MotionMagDemo/MotionMagDemoApp.swift @@ -0,0 +1,10 @@ +import SwiftUI + +@main +struct MotionMagDemoApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} diff --git a/creative_apps/NAFNetDemo/NAFNetDemo.xcodeproj/project.pbxproj b/creative_apps/NAFNetDemo/NAFNetDemo.xcodeproj/project.pbxproj new file mode 100644 index 0000000..4e406f4 --- /dev/null +++ b/creative_apps/NAFNetDemo/NAFNetDemo.xcodeproj/project.pbxproj @@ -0,0 +1,272 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + B20000010000000000000001 /* NAFNetDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = B20000020000000000000001; }; + B20000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B20000020000000000000002; }; + B20000010000000000000003 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = B20000020000000000000003; }; + B2000001000000000000B001 /* NAFNet_Deblur.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = B2000002000000000000B001; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + B20000020000000000000000 /* NAFNetDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = NAFNetDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; + B20000020000000000000001 /* NAFNetDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = NAFNetDemoApp.swift; sourceTree = ""; }; + B20000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + B20000020000000000000003 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + B20000020000000000000004 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + B2000002000000000000B001 /* NAFNet_Deblur.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = NAFNet_Deblur.mlpackage; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + B20000030000000000000001 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + B20000040000000000000000 = { + isa = PBXGroup; + children = ( + B20000040000000000000001 /* NAFNetDemo */, + B20000040000000000000002 /* Products */, + ); + sourceTree = ""; + }; + B20000040000000000000001 /* NAFNetDemo */ = { + isa = PBXGroup; + children = ( + B20000020000000000000001 /* NAFNetDemoApp.swift */, + B20000020000000000000002 /* ContentView.swift */, + B20000020000000000000003 /* Assets.xcassets */, + B20000020000000000000004 /* Info.plist */, + B2000002000000000000B001 /* NAFNet_Deblur.mlpackage */, + ); + path = NAFNetDemo; + sourceTree = ""; + }; + B20000040000000000000002 /* Products */ = { + isa = PBXGroup; + children = ( + B20000020000000000000000 /* NAFNetDemo.app */, + ); + name = Products; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget 
section */ + B20000050000000000000001 /* NAFNetDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = B20000070000000000000001; + buildPhases = ( + B20000060000000000000001 /* Sources */, + B20000030000000000000001 /* Frameworks */, + B20000060000000000000002 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = NAFNetDemo; + productName = NAFNetDemo; + productReference = B20000020000000000000000; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + B20000080000000000000001 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + B20000050000000000000001 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = B20000070000000000000002; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = B20000040000000000000000; + productRefGroup = B20000040000000000000002; + projectDirPath = ""; + projectRoot = ""; + targets = ( + B20000050000000000000001 /* NAFNetDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + B20000060000000000000002 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + B20000010000000000000003 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + B20000060000000000000001 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + B20000010000000000000001 /* NAFNetDemoApp.swift in Sources */, + B20000010000000000000002 /* ContentView.swift in Sources */, + B2000001000000000000B001 /* NAFNet_Deblur.mlpackage in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + B20000090000000000000001 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + B20000090000000000000002 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + 
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + B20000090000000000000003 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = NAFNetDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.nafnetdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + B20000090000000000000004 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = NAFNetDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.nafnetdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + B20000070000000000000001 /* Build configuration list for PBXNativeTarget "NAFNetDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + B20000090000000000000003 /* Debug */, + B20000090000000000000004 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + B20000070000000000000002 /* Build configuration list for PBXProject "NAFNetDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + B20000090000000000000001 /* Debug */, + B20000090000000000000002 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = B20000080000000000000001 /* Project object */; +} diff --git a/creative_apps/NAFNetDemo/NAFNetDemo/Assets.xcassets/AccentColor.colorset/Contents.json 
b/creative_apps/NAFNetDemo/NAFNetDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/NAFNetDemo/NAFNetDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/NAFNetDemo/NAFNetDemo/Assets.xcassets/Contents.json b/creative_apps/NAFNetDemo/NAFNetDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/creative_apps/NAFNetDemo/NAFNetDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/NAFNetDemo/NAFNetDemo/ContentView.swift b/creative_apps/NAFNetDemo/NAFNetDemo/ContentView.swift new file mode 100644 index 0000000..1043e75 --- /dev/null +++ b/creative_apps/NAFNetDemo/NAFNetDemo/ContentView.swift @@ -0,0 +1,462 @@ +import SwiftUI +import UIKit +import CoreML +import PhotosUI + +// MARK: - NAFNet Deblurring Processor + +/// Handles image deblurring using the NAFNet CoreML model +class DeblurProcessor: ObservableObject { + @Published var inputImage: UIImage? + @Published var outputImage: UIImage? + @Published var isProcessing = false + @Published var errorMessage: String? + @Published var inferenceTime: Double = 0 + + private var model: MLModel? + private let inputSize = 256 + + init() { + loadModel() + } + + private func loadModel() { + do { + let config = MLModelConfiguration() + config.computeUnits = .all + + guard let modelURL = Bundle.main.url(forResource: "NAFNet_Deblur", withExtension: "mlmodelc") else { + errorMessage = "Model not found. Please add NAFNet_Deblur.mlmodelc to the project bundle." + return + } + model = try MLModel(contentsOf: modelURL, configuration: config) + } catch { + errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + + /// Convert UIImage to CHW float array normalized to [0, 1] + private func imageToFloatArray(_ image: UIImage) -> [Float]? { + guard let cgImage = image.cgImage else { return nil } + + let width = inputSize + let height = inputSize + let colorSpace = CGColorSpaceCreateDeviceRGB() + var pixelData = [UInt8](repeating: 0, count: width * height * 4) + + guard let context = CGContext( + data: &pixelData, + width: width, + height: height, + bitsPerComponent: 8, + bytesPerRow: width * 4, + space: colorSpace, + bitmapInfo: CGImageAlphaInfo.noneSkipLast.rawValue + ) else { return nil } + + context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height)) + + var floatData = [Float](repeating: 0, count: 3 * width * height) + for y in 0.. UIImage? { + var pixelData = [UInt8](repeating: 255, count: width * height * 4) + for y in 0.. Path { + var path = Path() + path.addRect(CGRect(x: 0, y: 0, width: rect.width * position, height: rect.height)) + return path + } +} + +// MARK: - Image Picker + +struct ImagePicker: UIViewControllerRepresentable { + @Binding var image: UIImage? 
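+
+    // A minimal sketch of the prediction step for DeblurProcessor above, assuming the
+    // model takes a 1x3x256x256 float array in [0, 1] and that its feature names are
+    // "input" and "output" (both names are hypothetical, not taken from the .mlpackage):
+    //
+    //     let array = try MLMultiArray(shape: [1, 3, 256, 256], dataType: .float32)
+    //     floatData.withUnsafeBufferPointer { buf in
+    //         array.dataPointer.assumingMemoryBound(to: Float.self)
+    //             .update(from: buf.baseAddress!, count: buf.count)
+    //     }
+    //     let provider = try MLDictionaryFeatureProvider(
+    //         dictionary: ["input": MLFeatureValue(multiArray: array)])
+    //     let output = try model.prediction(from: provider)
+    //         .featureValue(for: "output")?.multiArrayValue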
+ + func makeUIViewController(context: Context) -> PHPickerViewController { + var config = PHPickerConfiguration() + config.filter = .images + config.selectionLimit = 1 + let picker = PHPickerViewController(configuration: config) + picker.delegate = context.coordinator + return picker + } + + func updateUIViewController(_ uiViewController: PHPickerViewController, context: Context) {} + + func makeCoordinator() -> Coordinator { + Coordinator(self) + } + + class Coordinator: NSObject, PHPickerViewControllerDelegate { + let parent: ImagePicker + + init(_ parent: ImagePicker) { + self.parent = parent + } + + func picker(_ picker: PHPickerViewController, didFinishPicking results: [PHPickerResult]) { + picker.dismiss(animated: true) + guard let provider = results.first?.itemProvider, + provider.canLoadObject(ofClass: UIImage.self) else { return } + provider.loadObject(ofClass: UIImage.self) { image, _ in + DispatchQueue.main.async { + self.parent.image = image as? UIImage + } + } + } + } +} + +// MARK: - Content View + +struct ContentView: View { + @StateObject private var processor = DeblurProcessor() + @State private var showImagePicker = false + @State private var selectedImage: UIImage? + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 20) { + // Header + headerSection + + // Error display + if let error = processor.errorMessage { + errorBanner(error) + } + + // Pick image button + Button { + showImagePicker = true + } label: { + Label("Pick Blurry Photo", systemImage: "photo.badge.plus") + .font(.headline) + .frame(maxWidth: .infinity) + .padding() + .background(Color.blue) + .foregroundColor(.white) + .cornerRadius(12) + } + .padding(.horizontal) + + // Processing indicator + if processor.isProcessing { + ProgressView("Deblurring image...") + .padding() + } + + // Inference time + if processor.inferenceTime > 0 { + HStack { + Image(systemName: "clock") + .foregroundColor(.orange) + Text(String(format: "Inference time: %.1f ms", processor.inferenceTime)) + .font(.subheadline.bold()) + .foregroundColor(.orange) + } + .padding(.horizontal) + } + + // Comparison view + if let input = processor.inputImage, let output = processor.outputImage { + VStack(spacing: 8) { + Text("Drag to Compare") + .font(.headline) + SliderComparisonView(beforeImage: input, afterImage: output) + .frame(height: 300) + .cornerRadius(12) + .padding(.horizontal) + } + } else if let input = processor.inputImage { + // Show just the input if no output yet + VStack(spacing: 8) { + Text("Input Image") + .font(.headline) + Image(uiImage: input) + .resizable() + .scaledToFit() + .frame(maxHeight: 300) + .cornerRadius(12) + .padding(.horizontal) + } + } + + // Side by side view + if let input = processor.inputImage, let output = processor.outputImage { + VStack(spacing: 8) { + Text("Side by Side") + .font(.headline) + HStack(spacing: 8) { + VStack { + Text("Before") + .font(.caption.bold()) + .foregroundColor(.secondary) + Image(uiImage: input) + .resizable() + .scaledToFit() + .cornerRadius(8) + } + VStack { + Text("After") + .font(.caption.bold()) + .foregroundColor(.secondary) + Image(uiImage: output) + .resizable() + .scaledToFit() + .cornerRadius(8) + } + } + .padding(.horizontal) + } + } + + Spacer(minLength: 40) + } + .padding(.vertical) + } + .navigationTitle("NAFNet Deblur") + .sheet(isPresented: $showImagePicker) { + ImagePicker(image: $selectedImage) + } + .onChange(of: selectedImage) { newValue in + guard let image = newValue else { return } + Task { + await processor.deblur(image: 
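+                    // deblur(image:) is async, so this Task keeps the sheet dismissal and
+                    // the ProgressView responsive; any @Published writes it makes must land
+                    // on the main queue, since SwiftUI observes them for view updates.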
image)
+                }
+            }
+        }
+    }
+
+    // MARK: - Subviews
+
+    private var headerSection: some View {
+        VStack(spacing: 8) {
+            Image(systemName: "camera.filters")
+                .font(.system(size: 50))
+                .foregroundColor(.blue)
+            Text("Image Deblurring")
+                .font(.title2.bold())
+            Text("Remove blur from photos using the NAFNet neural network")
+                .font(.subheadline)
+                .foregroundColor(.secondary)
+                .multilineTextAlignment(.center)
+        }
+        .padding()
+    }
+
+    private func errorBanner(_ message: String) -> some View {
+        HStack {
+            Image(systemName: "exclamationmark.triangle.fill")
+                .foregroundColor(.yellow)
+            Text(message)
+                .font(.caption)
+        }
+        .padding()
+        .background(Color.red.opacity(0.1))
+        .cornerRadius(8)
+        .padding(.horizontal)
+    }
+}
+
+#Preview {
+    ContentView()
+}
diff --git a/creative_apps/NAFNetDemo/NAFNetDemo/Info.plist b/creative_apps/NAFNetDemo/NAFNetDemo/Info.plist
new file mode 100644
index 0000000..e22fd04
--- /dev/null
+++ b/creative_apps/NAFNetDemo/NAFNetDemo/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>Photo library access is needed to select images for deblurring.</string>
+</dict>
+</plist>
diff --git a/creative_apps/NAFNetDemo/NAFNetDemo/NAFNetDemoApp.swift b/creative_apps/NAFNetDemo/NAFNetDemo/NAFNetDemoApp.swift
new file mode 100644
index 0000000..4908406
--- /dev/null
+++ b/creative_apps/NAFNetDemo/NAFNetDemo/NAFNetDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct NAFNetDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/creative_apps/RelightDemo/RelightDemo.xcodeproj/project.pbxproj b/creative_apps/RelightDemo/RelightDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..ccaa1ef
--- /dev/null
+++ b/creative_apps/RelightDemo/RelightDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,272 @@
+// !$*UTF8*$!
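+// Note: DPR_Relighting.mlpackage is listed in the Sources build phase below, so Xcode
+// compiles it into DPR_Relighting.mlmodelc inside the app bundle at build time; that is
+// why ContentView looks the model up with withExtension: "mlmodelc", not "mlpackage".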
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + E50000010000000000000001 /* RelightDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = E50000020000000000000001; }; + E50000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E50000020000000000000002; }; + E50000010000000000000003 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = E50000020000000000000003; }; + E5000001000000000000E001 /* DPR_Relighting.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = E5000002000000000000E001; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + E50000020000000000000000 /* RelightDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = RelightDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; + E50000020000000000000001 /* RelightDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RelightDemoApp.swift; sourceTree = ""; }; + E50000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + E50000020000000000000003 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + E50000020000000000000004 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + E5000002000000000000E001 /* DPR_Relighting.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = DPR_Relighting.mlpackage; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + E50000030000000000000001 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + E50000040000000000000000 = { + isa = PBXGroup; + children = ( + E50000040000000000000001 /* RelightDemo */, + E50000040000000000000002 /* Products */, + ); + sourceTree = ""; + }; + E50000040000000000000001 /* RelightDemo */ = { + isa = PBXGroup; + children = ( + E50000020000000000000001 /* RelightDemoApp.swift */, + E50000020000000000000002 /* ContentView.swift */, + E50000020000000000000003 /* Assets.xcassets */, + E50000020000000000000004 /* Info.plist */, + E5000002000000000000E001 /* DPR_Relighting.mlpackage */, + ); + path = RelightDemo; + sourceTree = ""; + }; + E50000040000000000000002 /* Products */ = { + isa = PBXGroup; + children = ( + E50000020000000000000000 /* RelightDemo.app */, + ); + name = Products; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + E50000050000000000000001 /* RelightDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = E50000070000000000000001; + buildPhases = ( + E50000060000000000000001 /* Sources */, + E50000030000000000000001 /* Frameworks */, + E50000060000000000000002 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = RelightDemo; + productName = RelightDemo; + productReference = E50000020000000000000000; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + E50000080000000000000001 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + 
LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + E50000050000000000000001 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = E50000070000000000000002; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = E50000040000000000000000; + productRefGroup = E50000040000000000000002; + projectDirPath = ""; + projectRoot = ""; + targets = ( + E50000050000000000000001 /* RelightDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + E50000060000000000000002 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + E50000010000000000000003 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + E50000060000000000000001 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + E50000010000000000000001 /* RelightDemoApp.swift in Sources */, + E50000010000000000000002 /* ContentView.swift in Sources */, + E5000001000000000000E001 /* DPR_Relighting.mlpackage in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + E50000090000000000000001 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + E50000090000000000000002 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + E50000090000000000000003 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = RelightDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = 
YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.relightdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + E50000090000000000000004 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = RelightDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.relightdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + E50000070000000000000001 /* Build configuration list for PBXNativeTarget "RelightDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + E50000090000000000000003 /* Debug */, + E50000090000000000000004 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + E50000070000000000000002 /* Build configuration list for PBXProject "RelightDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + E50000090000000000000001 /* Debug */, + E50000090000000000000002 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = E50000080000000000000001 /* Project object */; +} diff --git a/creative_apps/RelightDemo/RelightDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/RelightDemo/RelightDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/RelightDemo/RelightDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/RelightDemo/RelightDemo/Assets.xcassets/Contents.json b/creative_apps/RelightDemo/RelightDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/creative_apps/RelightDemo/RelightDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + 
"version" : 1 + } +} diff --git a/creative_apps/RelightDemo/RelightDemo/ContentView.swift b/creative_apps/RelightDemo/RelightDemo/ContentView.swift new file mode 100644 index 0000000..8c6e9b6 --- /dev/null +++ b/creative_apps/RelightDemo/RelightDemo/ContentView.swift @@ -0,0 +1,697 @@ +import SwiftUI +import UIKit +import CoreML +import PhotosUI + +// MARK: - Spherical Harmonics Lighting Presets + +/// Preset SH lighting coefficients (9 coefficients for 2nd order SH) +struct SHLightingPreset: Identifiable, Equatable { + let id = UUID() + let name: String + let icon: String + let coefficients: [Float] // 9 SH coefficients + + static func == (lhs: SHLightingPreset, rhs: SHLightingPreset) -> Bool { + lhs.id == rhs.id + } + + /// Preset lighting directions using 2nd-order Spherical Harmonics + /// SH basis: [Y00, Y1-1, Y10, Y11, Y2-2, Y2-1, Y20, Y21, Y22] + static let front = SHLightingPreset( + name: "Front", + icon: "sun.max.fill", + coefficients: [0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + ) + + static let left = SHLightingPreset( + name: "Left", + icon: "arrow.left.circle.fill", + coefficients: [0.5, 0.0, 0.0, -0.6, 0.0, 0.0, 0.0, 0.0, 0.3] + ) + + static let right = SHLightingPreset( + name: "Right", + icon: "arrow.right.circle.fill", + coefficients: [0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.0, 0.0, 0.3] + ) + + static let top = SHLightingPreset( + name: "Top", + icon: "arrow.up.circle.fill", + coefficients: [0.5, 0.0, 0.6, 0.0, 0.0, 0.0, 0.3, 0.0, 0.0] + ) + + static let bottom = SHLightingPreset( + name: "Bottom", + icon: "arrow.down.circle.fill", + coefficients: [0.5, 0.0, -0.6, 0.0, 0.0, 0.0, 0.3, 0.0, 0.0] + ) + + static let allPresets: [SHLightingPreset] = [front, left, right, top, bottom] +} + +// MARK: - Relighting Processor + +/// Processes portrait images through the DPR Relighting CoreML model +class RelightProcessor: ObservableObject { + @Published var inputImage: UIImage? + @Published var luminanceImage: UIImage? + @Published var relitImage: UIImage? + @Published var isProcessing = false + @Published var errorMessage: String? + @Published var selectedPreset: SHLightingPreset = .front + @Published var customSH: [Float] = [0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + + private var model: MLModel? + private let inputSize = 512 + + init() { + loadModel() + } + + private func loadModel() { + do { + let config = MLModelConfiguration() + config.computeUnits = .all + + guard let modelURL = Bundle.main.url(forResource: "DPR_Relighting", withExtension: "mlmodelc") else { + errorMessage = "Model not found. Please add DPR_Relighting.mlmodelc to the project bundle." + return + } + model = try MLModel(contentsOf: modelURL, configuration: config) + } catch { + errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + + /// Convert color image to grayscale luminance + private func convertToLuminance(_ image: UIImage) -> (UIImage?, [Float]?) 
{ + guard let cgImage = image.cgImage else { return (nil, nil) } + + let width = inputSize + let height = inputSize + let colorSpace = CGColorSpaceCreateDeviceRGB() + var pixelData = [UInt8](repeating: 0, count: width * height * 4) + + guard let context = CGContext( + data: &pixelData, + width: width, + height: height, + bitsPerComponent: 8, + bytesPerRow: width * 4, + space: colorSpace, + bitmapInfo: CGImageAlphaInfo.noneSkipLast.rawValue + ) else { return (nil, nil) } + + context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height)) + + // Compute luminance: Y = 0.299*R + 0.587*G + 0.114*B + var luminanceData = [Float](repeating: 0, count: width * height) + var grayPixels = [UInt8](repeating: 0, count: width * height * 4) + + for y in 0.. UIImage? { + var pixelData = [UInt8](repeating: 255, count: width * height * 4) + for y in 0.. PHPickerViewController { + var config = PHPickerConfiguration() + config.filter = .images + config.selectionLimit = 1 + let picker = PHPickerViewController(configuration: config) + picker.delegate = context.coordinator + return picker + } + + func updateUIViewController(_ uiViewController: PHPickerViewController, context: Context) {} + + func makeCoordinator() -> Coordinator { + Coordinator(self) + } + + class Coordinator: NSObject, PHPickerViewControllerDelegate { + let parent: ImagePicker + + init(_ parent: ImagePicker) { + self.parent = parent + } + + func picker(_ picker: PHPickerViewController, didFinishPicking results: [PHPickerResult]) { + picker.dismiss(animated: true) + guard let provider = results.first?.itemProvider, + provider.canLoadObject(ofClass: UIImage.self) else { return } + provider.loadObject(ofClass: UIImage.self) { image, _ in + DispatchQueue.main.async { + self.parent.image = image as? UIImage + } + } + } + } +} + +// MARK: - Content View + +struct ContentView: View { + @StateObject private var processor = RelightProcessor() + @State private var showImagePicker = false + @State private var selectedImage: UIImage? 
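+
+    // Worked example for the presets defined above: in the basis order
+    // [Y00, Y1-1, Y10, Y11, ...], index 3 (Y11) tracks the horizontal light direction,
+    // so "Left" sets Y11 = -0.6 and "Right" flips it to +0.6, while both keep the
+    // ambient Y00 term at 0.5. The model consumes exactly these nine floats alongside
+    // the luminance channel. A sketch of what relight(image:shCoefficients:) plausibly
+    // runs -- the feature names "luminance", "light", and "relit" are assumptions, not
+    // the converted model's actual interface:
+    //
+    //     let light = try MLMultiArray(shape: [1, 9, 1, 1], dataType: .float32)
+    //     for (i, c) in shCoefficients.enumerated() { light[i] = NSNumber(value: c) }
+    //     let inputs = try MLDictionaryFeatureProvider(dictionary: [
+    //         "luminance": MLFeatureValue(multiArray: lumArray),
+    //         "light": MLFeatureValue(multiArray: light),
+    //     ])
+    //     let relit = try model.prediction(from: inputs)
+    //         .featureValue(for: "relit")?.multiArrayValue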
+ @State private var useCustomLighting = false + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 20) { + // Header + headerSection + + // Error display + if let error = processor.errorMessage { + errorBanner(error) + } + + // Pick image button + Button { + showImagePicker = true + } label: { + Label("Select Portrait Photo", systemImage: "person.crop.rectangle") + .font(.headline) + .frame(maxWidth: .infinity) + .padding() + .background(Color.orange) + .foregroundColor(.white) + .cornerRadius(12) + } + .padding(.horizontal) + + // Lighting controls + lightingControlSection + + // Apply button + if processor.inputImage != nil && !processor.isProcessing { + Button { + applyRelighting() + } label: { + Label("Apply Relighting", systemImage: "light.max") + .font(.headline) + .frame(maxWidth: .infinity) + .padding() + .background(Color.yellow) + .foregroundColor(.black) + .cornerRadius(12) + } + .padding(.horizontal) + } + + // Processing indicator + if processor.isProcessing { + ProgressView("Relighting portrait...") + .padding() + } + + // Results + if processor.inputImage != nil || processor.relitImage != nil { + resultsSection + } + + // SH coefficient display + shCoefficientDisplay + + Spacer(minLength: 40) + } + .padding(.vertical) + } + .navigationTitle("Portrait Relight") + .sheet(isPresented: $showImagePicker) { + ImagePicker(image: $selectedImage) + } + .onChange(of: selectedImage) { newValue in + if let image = newValue { + processor.inputImage = image + } + } + } + } + + // MARK: - Subviews + + private var headerSection: some View { + VStack(spacing: 8) { + Image(systemName: "light.beacon.max") + .font(.system(size: 50)) + .foregroundColor(.orange) + Text("Portrait Relighting") + .font(.title2.bold()) + Text("Change lighting direction on portraits using DPR model with Spherical Harmonics") + .font(.subheadline) + .foregroundColor(.secondary) + .multilineTextAlignment(.center) + } + .padding() + } + + private func errorBanner(_ message: String) -> some View { + HStack { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.yellow) + Text(message) + .font(.caption) + } + .padding() + .background(Color.red.opacity(0.1)) + .cornerRadius(8) + .padding(.horizontal) + } + + private var lightingControlSection: some View { + VStack(spacing: 16) { + // Toggle between presets and custom + Picker("Lighting Mode", selection: $useCustomLighting) { + Text("Presets").tag(false) + Text("Custom").tag(true) + } + .pickerStyle(.segmented) + .padding(.horizontal) + + if useCustomLighting { + // Interactive sphere + VStack(spacing: 8) { + Text("Drag to Set Light Direction") + .font(.subheadline.bold()) + LightDirectionSphere(shCoefficients: $processor.customSH) + .frame(height: 200) + .padding(.horizontal, 60) + } + } else { + // Preset buttons + VStack(spacing: 8) { + Text("Lighting Presets") + .font(.subheadline.bold()) + + HStack(spacing: 12) { + ForEach(SHLightingPreset.allPresets) { preset in + Button { + processor.selectedPreset = preset + processor.customSH = preset.coefficients + } label: { + VStack(spacing: 4) { + Image(systemName: preset.icon) + .font(.title2) + Text(preset.name) + .font(.caption2) + } + .frame(maxWidth: .infinity) + .padding(.vertical, 12) + .background( + processor.selectedPreset == preset + ? Color.orange.opacity(0.2) + : Color(.systemGray6) + ) + .cornerRadius(10) + .overlay( + RoundedRectangle(cornerRadius: 10) + .stroke( + processor.selectedPreset == preset + ? 
Color.orange
+                                            : Color.clear,
+                                        lineWidth: 2
+                                    )
+                            )
+                        }
+                        .foregroundColor(.primary)
+                    }
+                }
+                .padding(.horizontal)
+            }
+        }
+    }
+}
+
+    private var resultsSection: some View {
+        VStack(spacing: 16) {
+            // Original vs Relit comparison
+            HStack(spacing: 12) {
+                // Original
+                VStack(spacing: 4) {
+                    Text("Original")
+                        .font(.caption.bold())
+                        .foregroundColor(.secondary)
+                    if let image = processor.inputImage {
+                        Image(uiImage: image)
+                            .resizable()
+                            .scaledToFit()
+                            .cornerRadius(8)
+                    }
+                }
+
+                // Relit
+                VStack(spacing: 4) {
+                    Text("Relit")
+                        .font(.caption.bold())
+                        .foregroundColor(.secondary)
+                    if let image = processor.relitImage {
+                        Image(uiImage: image)
+                            .resizable()
+                            .scaledToFit()
+                            .cornerRadius(8)
+                    } else {
+                        Rectangle()
+                            .fill(Color(.systemGray5))
+                            .overlay(
+                                Text("Run relighting")
+                                    .font(.caption)
+                                    .foregroundColor(.secondary)
+                            )
+                            .cornerRadius(8)
+                    }
+                }
+            }
+            .padding(.horizontal)
+
+            // Luminance intermediate
+            if let lumImage = processor.luminanceImage {
+                VStack(spacing: 4) {
+                    Text("Luminance Input (512x512)")
+                        .font(.caption.bold())
+                        .foregroundColor(.secondary)
+                    Image(uiImage: lumImage)
+                        .resizable()
+                        .scaledToFit()
+                        .frame(maxHeight: 200)
+                        .cornerRadius(8)
+                }
+                .padding(.horizontal)
+            }
+        }
+    }
+
+    private var shCoefficientDisplay: some View {
+        VStack(alignment: .leading, spacing: 8) {
+            Text("SH Coefficients")
+                .font(.headline)
+
+            let labels = ["Y00 (ambient)", "Y1-1 (vertical)", "Y10 (depth)", "Y11 (horizontal)",
+                          "Y2-2", "Y2-1", "Y20", "Y21", "Y22"]
+
+            ForEach(0..<9, id: \.self) { i in
+                HStack {
+                    Text(labels[i])
+                        .font(.system(size: 10, design: .monospaced))
+                        .frame(width: 110, alignment: .leading)
+                    Slider(
+                        value: Binding(
+                            get: { Double(processor.customSH[i]) },
+                            set: { processor.customSH[i] = Float($0) }
+                        ),
+                        in: -1.0...1.0
+                    )
+                    .tint(.orange)
+                    Text(String(format: "%.2f", processor.customSH[i]))
+                        .font(.system(size: 10, design: .monospaced))
+                        .frame(width: 40, alignment: .trailing)
+                }
+            }
+        }
+        .padding()
+        .background(Color(.systemGray6))
+        .cornerRadius(12)
+        .padding(.horizontal)
+    }
+
+    // MARK: - Actions
+
+    private func applyRelighting() {
+        guard let image = processor.inputImage else { return }
+        let sh = useCustomLighting ? processor.customSH : processor.selectedPreset.coefficients
+        Task {
+            await processor.relight(image: image, shCoefficients: sh)
+        }
+    }
+}
+
+#Preview {
+    ContentView()
+}
diff --git a/creative_apps/RelightDemo/RelightDemo/Info.plist b/creative_apps/RelightDemo/RelightDemo/Info.plist
new file mode 100644
index 0000000..b161998
--- /dev/null
+++ b/creative_apps/RelightDemo/RelightDemo/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>Photo library access is needed to select portrait photos for relighting.</string>
+</dict>
+</plist>
diff --git a/creative_apps/RelightDemo/RelightDemo/RelightDemoApp.swift b/creative_apps/RelightDemo/RelightDemo/RelightDemoApp.swift
new file mode 100644
index 0000000..38f1458
--- /dev/null
+++ b/creative_apps/RelightDemo/RelightDemo/RelightDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct RelightDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/creative_apps/SimSwapDemo/SimSwapDemo.xcodeproj/project.pbxproj b/creative_apps/SimSwapDemo/SimSwapDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..0668d7e
--- /dev/null
+++ b/creative_apps/SimSwapDemo/SimSwapDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,276 @@
+// !$*UTF8*$!
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + C30000010000000000000001 /* SimSwapDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = C30000020000000000000001; }; + C30000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = C30000020000000000000002; }; + C30000010000000000000003 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = C30000020000000000000003; }; + C3000001000000000000C001 /* SimSwap_ArcFace.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C3000002000000000000C001; }; + C3000001000000000000C002 /* SimSwap_Generator.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C3000002000000000000C002; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + C30000020000000000000000 /* SimSwapDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = SimSwapDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; + C30000020000000000000001 /* SimSwapDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SimSwapDemoApp.swift; sourceTree = ""; }; + C30000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + C30000020000000000000003 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + C30000020000000000000004 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + C3000002000000000000C001 /* SimSwap_ArcFace.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = SimSwap_ArcFace.mlpackage; sourceTree = ""; }; + C3000002000000000000C002 /* SimSwap_Generator.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = SimSwap_Generator.mlpackage; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + C30000030000000000000001 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + C30000040000000000000000 = { + isa = PBXGroup; + children = ( + C30000040000000000000001 /* SimSwapDemo */, + C30000040000000000000002 /* Products */, + ); + sourceTree = ""; + }; + C30000040000000000000001 /* SimSwapDemo */ = { + isa = PBXGroup; + children = ( + C30000020000000000000001 /* SimSwapDemoApp.swift */, + C30000020000000000000002 /* ContentView.swift */, + C30000020000000000000003 /* Assets.xcassets */, + C30000020000000000000004 /* Info.plist */, + C3000002000000000000C001 /* SimSwap_ArcFace.mlpackage */, + C3000002000000000000C002 /* SimSwap_Generator.mlpackage */, + ); + path = SimSwapDemo; + sourceTree = ""; + }; + C30000040000000000000002 /* Products */ = { + isa = PBXGroup; + children = ( + C30000020000000000000000 /* SimSwapDemo.app */, + ); + name = Products; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + C30000050000000000000001 /* SimSwapDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = C30000070000000000000001; + buildPhases = ( + C30000060000000000000001 /* Sources */, + C30000030000000000000001 /* Frameworks */, + C30000060000000000000002 /* Resources */, + ); + buildRules = ( + ); + 
dependencies = ( + ); + name = SimSwapDemo; + productName = SimSwapDemo; + productReference = C30000020000000000000000; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + C30000080000000000000001 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + C30000050000000000000001 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = C30000070000000000000002; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = C30000040000000000000000; + productRefGroup = C30000040000000000000002; + projectDirPath = ""; + projectRoot = ""; + targets = ( + C30000050000000000000001 /* SimSwapDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + C30000060000000000000002 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + C30000010000000000000003 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + C30000060000000000000001 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + C30000010000000000000001 /* SimSwapDemoApp.swift in Sources */, + C30000010000000000000002 /* ContentView.swift in Sources */, + C3000001000000000000C001 /* SimSwap_ArcFace.mlpackage in Sources */, + C3000001000000000000C002 /* SimSwap_Generator.mlpackage in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + C30000090000000000000001 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + C30000090000000000000002 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + C30000090000000000000003 /* Debug */ = { + isa = 
XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = SimSwapDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.simswapdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + C30000090000000000000004 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = SimSwapDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.simswapdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + C30000070000000000000001 /* Build configuration list for PBXNativeTarget "SimSwapDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + C30000090000000000000003 /* Debug */, + C30000090000000000000004 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + C30000070000000000000002 /* Build configuration list for PBXProject "SimSwapDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + C30000090000000000000001 /* Debug */, + C30000090000000000000002 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = C30000080000000000000001 /* Project object */; +} diff --git a/creative_apps/SimSwapDemo/SimSwapDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/SimSwapDemo/SimSwapDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/SimSwapDemo/SimSwapDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 
+1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/SimSwapDemo/SimSwapDemo/Assets.xcassets/Contents.json b/creative_apps/SimSwapDemo/SimSwapDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/creative_apps/SimSwapDemo/SimSwapDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/SimSwapDemo/SimSwapDemo/ContentView.swift b/creative_apps/SimSwapDemo/SimSwapDemo/ContentView.swift new file mode 100644 index 0000000..60730ad --- /dev/null +++ b/creative_apps/SimSwapDemo/SimSwapDemo/ContentView.swift @@ -0,0 +1,689 @@ +import SwiftUI +import UIKit +import CoreML +import Vision +import PhotosUI + +// MARK: - Pipeline Step + +/// Represents the current step in the face swap pipeline +enum PipelineStep: String, CaseIterable { + case idle = "Ready" + case detectingSourceFace = "Detecting source face..." + case extractingIdentity = "Extracting identity embedding..." + case detectingTargetFace = "Detecting target face..." + case generatingSwap = "Generating face swap..." + case complete = "Complete" + case error = "Error" +} + +// MARK: - Face Swap Processor + +/// Two-stage face swap pipeline: ArcFace embedding + SimSwap generator +class FaceSwapProcessor: ObservableObject { + @Published var sourceImage: UIImage? + @Published var targetImage: UIImage? + @Published var resultImage: UIImage? + @Published var isProcessing = false + @Published var currentStep: PipelineStep = .idle + @Published var errorMessage: String? + + // Cropped face images for display + @Published var sourceFaceCrop: UIImage? + @Published var targetFaceCrop: UIImage? + + private var arcFaceModel: MLModel? + private var generatorModel: MLModel? + + init() { + loadModels() + } + + private func loadModels() { + do { + let config = MLModelConfiguration() + config.computeUnits = .all + + // Load ArcFace model + if let arcFaceURL = Bundle.main.url(forResource: "SimSwap_ArcFace", withExtension: "mlmodelc") { + arcFaceModel = try MLModel(contentsOf: arcFaceURL, configuration: config) + } else { + errorMessage = "ArcFace model not found. Please add SimSwap_ArcFace.mlmodelc to the bundle." + } + + // Load Generator model + if let genURL = Bundle.main.url(forResource: "SimSwap_Generator", withExtension: "mlmodelc") { + generatorModel = try MLModel(contentsOf: genURL, configuration: config) + } else { + let msg = "Generator model not found. Please add SimSwap_Generator.mlmodelc to the bundle." + errorMessage = errorMessage != nil ? "\(errorMessage!) \(msg)" : msg + } + } catch { + errorMessage = "Failed to load models: \(error.localizedDescription)" + } + } + + /// Detect the largest face in an image using Vision and return its bounding box + private func detectFace(in image: UIImage) async throws -> CGRect? { + guard let cgImage = image.cgImage else { return nil } + + return try await withCheckedThrowingContinuation { continuation in + let request = VNDetectFaceRectanglesRequest { request, error in + if let error = error { + continuation.resume(throwing: error) + return + } + // Get the largest face + let faces = request.results as? [VNFaceObservation] ?? 
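+                // Several faces may be returned; the pipeline keys on the largest box on
+                // the assumption that the main subject dominates the frame. Vision boxes
+                // are normalized with a bottom-left origin: e.g. (x: 0.4, y: 0.1,
+                // w: 0.2, h: 0.3) in a 1000 px tall image starts at top-left
+                // y = (1 - 0.1 - 0.3) * 1000 = 600 px -- the flip cropFace(from:) applies.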
[] + let largestFace = faces.max(by: { $0.boundingBox.width * $0.boundingBox.height < $1.boundingBox.width * $1.boundingBox.height }) + continuation.resume(returning: largestFace?.boundingBox) + } + + let handler = VNImageRequestHandler(cgImage: cgImage, options: [:]) + do { + try handler.perform([request]) + } catch { + continuation.resume(throwing: error) + } + } + } + + /// Crop face region from image with some padding + private func cropFace(from image: UIImage, boundingBox: CGRect, targetSize: CGSize) -> UIImage? { + guard let cgImage = image.cgImage else { return nil } + + let imageWidth = CGFloat(cgImage.width) + let imageHeight = CGFloat(cgImage.height) + + // Convert Vision coordinates (origin at bottom-left) to CGImage coordinates (origin at top-left) + let x = boundingBox.origin.x * imageWidth + let y = (1.0 - boundingBox.origin.y - boundingBox.height) * imageHeight + let w = boundingBox.width * imageWidth + let h = boundingBox.height * imageHeight + + // Add 20% padding + let padding: CGFloat = 0.2 + let padX = w * padding + let padY = h * padding + let cropRect = CGRect( + x: max(0, x - padX), + y: max(0, y - padY), + width: min(imageWidth - max(0, x - padX), w + 2 * padX), + height: min(imageHeight - max(0, y - padY), h + 2 * padY) + ) + + guard let croppedCGImage = cgImage.cropping(to: cropRect) else { return nil } + + // Resize to target size + let renderer = UIGraphicsImageRenderer(size: targetSize) + let resized = renderer.image { _ in + UIImage(cgImage: croppedCGImage).draw(in: CGRect(origin: .zero, size: targetSize)) + } + return resized + } + + /// Convert UIImage to CHW float array + private func imageToFloatArray(_ image: UIImage, size: Int) -> [Float]? { + guard let cgImage = image.cgImage else { return nil } + + let colorSpace = CGColorSpaceCreateDeviceRGB() + var pixelData = [UInt8](repeating: 0, count: size * size * 4) + + guard let context = CGContext( + data: &pixelData, + width: size, + height: size, + bitsPerComponent: 8, + bytesPerRow: size * 4, + space: colorSpace, + bitmapInfo: CGImageAlphaInfo.noneSkipLast.rawValue + ) else { return nil } + + context.draw(cgImage, in: CGRect(x: 0, y: 0, width: size, height: size)) + + var floatData = [Float](repeating: 0, count: 3 * size * size) + for y in 0.. UIImage? { + var pixelData = [UInt8](repeating: 255, count: size * size * 4) + for y in 0.. PHPickerViewController { + var config = PHPickerConfiguration() + config.filter = .images + config.selectionLimit = 1 + let picker = PHPickerViewController(configuration: config) + picker.delegate = context.coordinator + return picker + } + + func updateUIViewController(_ uiViewController: PHPickerViewController, context: Context) {} + + func makeCoordinator() -> Coordinator { + Coordinator(self) + } + + class Coordinator: NSObject, PHPickerViewControllerDelegate { + let parent: ImagePicker + + init(_ parent: ImagePicker) { + self.parent = parent + } + + func picker(_ picker: PHPickerViewController, didFinishPicking results: [PHPickerResult]) { + picker.dismiss(animated: true) + guard let provider = results.first?.itemProvider, + provider.canLoadObject(ofClass: UIImage.self) else { return } + provider.loadObject(ofClass: UIImage.self) { image, _ in + DispatchQueue.main.async { + self.parent.image = image as? UIImage + } + } + } + } +} + +// MARK: - Pipeline Step View + +struct PipelineStepRow: View { + let step: PipelineStep + let currentStep: PipelineStep + let allSteps: [PipelineStep] + + private var stepIndex: Int { allSteps.firstIndex(of: step) ?? 
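+    // The two model calls these pipeline rows visualize, sketched with hypothetical
+    // feature names ("face", "embedding", "target", "identity" are assumptions):
+    //
+    //     let id = try arcFaceModel.prediction(from: try MLDictionaryFeatureProvider(
+    //         dictionary: ["face": MLFeatureValue(multiArray: source112)]))
+    //         .featureValue(for: "embedding")!.multiArrayValue!   // identity vector
+    //     let swapped = try generatorModel.prediction(from: try MLDictionaryFeatureProvider(
+    //         dictionary: ["target": MLFeatureValue(multiArray: target224),
+    //                      "identity": MLFeatureValue(multiArray: id)]))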
0 } + private var currentIndex: Int { allSteps.firstIndex(of: currentStep) ?? 0 } + + private var status: StepStatus { + if currentStep == .error && step == allSteps[currentIndex] { return .error } + if stepIndex < currentIndex { return .completed } + if stepIndex == currentIndex { return .active } + return .pending + } + + enum StepStatus { + case pending, active, completed, error + } + + var body: some View { + HStack(spacing: 12) { + ZStack { + Circle() + .fill(statusColor) + .frame(width: 28, height: 28) + statusIcon + } + Text(step.rawValue) + .font(.subheadline) + .foregroundColor(status == .pending ? .secondary : .primary) + Spacer() + if status == .active { + ProgressView() + .scaleEffect(0.8) + } + } + } + + private var statusColor: Color { + switch status { + case .pending: return Color(.systemGray4) + case .active: return .blue + case .completed: return .green + case .error: return .red + } + } + + @ViewBuilder + private var statusIcon: some View { + switch status { + case .pending: + Text("\(stepIndex + 1)") + .font(.caption2.bold()) + .foregroundColor(.white) + case .active: + Text("\(stepIndex + 1)") + .font(.caption2.bold()) + .foregroundColor(.white) + case .completed: + Image(systemName: "checkmark") + .font(.caption2.bold()) + .foregroundColor(.white) + case .error: + Image(systemName: "xmark") + .font(.caption2.bold()) + .foregroundColor(.white) + } + } +} + +// MARK: - Content View + +struct ContentView: View { + @StateObject private var processor = FaceSwapProcessor() + @State private var showSourcePicker = false + @State private var showTargetPicker = false + + private let pipelineSteps: [PipelineStep] = [ + .detectingSourceFace, + .extractingIdentity, + .detectingTargetFace, + .generatingSwap, + .complete + ] + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 20) { + // Header + headerSection + + // Error display + if let error = processor.errorMessage { + errorBanner(error) + } + + // Image selection + imageSelectionSection + + // Run button + if processor.sourceImage != nil && processor.targetImage != nil && !processor.isProcessing { + Button { + Task { await processor.performFaceSwap() } + } label: { + Label("Swap Faces", systemImage: "arrow.triangle.swap") + .font(.headline) + .frame(maxWidth: .infinity) + .padding() + .background(Color.purple) + .foregroundColor(.white) + .cornerRadius(12) + } + .padding(.horizontal) + } + + // Pipeline visualization + if processor.currentStep != .idle { + pipelineSection + } + + // Face crops display + faceCropsSection + + // Result + if let result = processor.resultImage { + resultSection(result) + } + + Spacer(minLength: 40) + } + .padding(.vertical) + } + .navigationTitle("SimSwap Face Swap") + .sheet(isPresented: $showSourcePicker) { + ImagePicker(image: $processor.sourceImage) + } + .sheet(isPresented: $showTargetPicker) { + ImagePicker(image: $processor.targetImage) + } + } + } + + // MARK: - Subviews + + private var headerSection: some View { + VStack(spacing: 8) { + Image(systemName: "person.2.crop.square.stack") + .font(.system(size: 50)) + .foregroundColor(.purple) + Text("Face Swap") + .font(.title2.bold()) + Text("Transfer identity from one face to another using SimSwap") + .font(.subheadline) + .foregroundColor(.secondary) + .multilineTextAlignment(.center) + } + .padding() + } + + private func errorBanner(_ message: String) -> some View { + HStack { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.yellow) + Text(message) + .font(.caption) + } + .padding() + 
.background(Color.red.opacity(0.1)) + .cornerRadius(8) + .padding(.horizontal) + } + + private var imageSelectionSection: some View { + HStack(spacing: 12) { + // Source face button/preview + VStack(spacing: 8) { + Text("Source (Identity)") + .font(.caption.bold()) + .foregroundColor(.secondary) + + Button { + showSourcePicker = true + } label: { + if let image = processor.sourceImage { + Image(uiImage: image) + .resizable() + .scaledToFill() + .frame(width: 140, height: 140) + .clipShape(RoundedRectangle(cornerRadius: 12)) + } else { + VStack(spacing: 8) { + Image(systemName: "person.crop.rectangle") + .font(.title) + Text("Select") + .font(.caption) + } + .frame(width: 140, height: 140) + .background(Color(.systemGray5)) + .cornerRadius(12) + } + } + .foregroundColor(.primary) + } + + Image(systemName: "arrow.right") + .font(.title2) + .foregroundColor(.secondary) + + // Target face button/preview + VStack(spacing: 8) { + Text("Target (Pose)") + .font(.caption.bold()) + .foregroundColor(.secondary) + + Button { + showTargetPicker = true + } label: { + if let image = processor.targetImage { + Image(uiImage: image) + .resizable() + .scaledToFill() + .frame(width: 140, height: 140) + .clipShape(RoundedRectangle(cornerRadius: 12)) + } else { + VStack(spacing: 8) { + Image(systemName: "person.crop.rectangle") + .font(.title) + Text("Select") + .font(.caption) + } + .frame(width: 140, height: 140) + .background(Color(.systemGray5)) + .cornerRadius(12) + } + } + .foregroundColor(.primary) + } + } + .padding(.horizontal) + } + + private var pipelineSection: some View { + VStack(alignment: .leading, spacing: 8) { + Text("Pipeline Progress") + .font(.headline) + + ForEach(pipelineSteps, id: \.self) { step in + PipelineStepRow(step: step, currentStep: processor.currentStep, allSteps: pipelineSteps) + } + } + .padding() + .background(Color(.systemGray6)) + .cornerRadius(12) + .padding(.horizontal) + } + + private var faceCropsSection: some View { + Group { + if processor.sourceFaceCrop != nil || processor.targetFaceCrop != nil { + VStack(spacing: 8) { + Text("Detected Faces") + .font(.headline) + HStack(spacing: 16) { + if let crop = processor.sourceFaceCrop { + VStack { + Text("Source 112x112") + .font(.caption2) + .foregroundColor(.secondary) + Image(uiImage: crop) + .resizable() + .scaledToFit() + .frame(width: 100, height: 100) + .cornerRadius(8) + .overlay( + RoundedRectangle(cornerRadius: 8) + .stroke(Color.purple, lineWidth: 2) + ) + } + } + if let crop = processor.targetFaceCrop { + VStack { + Text("Target 224x224") + .font(.caption2) + .foregroundColor(.secondary) + Image(uiImage: crop) + .resizable() + .scaledToFit() + .frame(width: 100, height: 100) + .cornerRadius(8) + .overlay( + RoundedRectangle(cornerRadius: 8) + .stroke(Color.orange, lineWidth: 2) + ) + } + } + } + } + .padding(.horizontal) + } + } + } + + private func resultSection(_ image: UIImage) -> some View { + VStack(spacing: 8) { + Text("Swapped Result") + .font(.title3.bold()) + Image(uiImage: image) + .resizable() + .scaledToFit() + .frame(maxHeight: 300) + .cornerRadius(12) + .overlay( + RoundedRectangle(cornerRadius: 12) + .stroke(Color.purple, lineWidth: 3) + ) + .shadow(color: .purple.opacity(0.3), radius: 10) + .padding(.horizontal) + } + } +} + +#Preview { + ContentView() +} diff --git a/creative_apps/SimSwapDemo/SimSwapDemo/Info.plist b/creative_apps/SimSwapDemo/SimSwapDemo/Info.plist new file mode 100644 index 0000000..d835fe4 --- /dev/null +++ b/creative_apps/SimSwapDemo/SimSwapDemo/Info.plist @@ -0,0 +1,8 @@ + + + 
+<plist version="1.0">
+<dict>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>Photo library access is needed to select face images for swapping.</string>
+</dict>
+</plist>
diff --git a/creative_apps/SimSwapDemo/SimSwapDemo/SimSwapDemoApp.swift b/creative_apps/SimSwapDemo/SimSwapDemo/SimSwapDemoApp.swift
new file mode 100644
index 0000000..ab197a4
--- /dev/null
+++ b/creative_apps/SimSwapDemo/SimSwapDemo/SimSwapDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct SimSwapDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/creative_apps/Wav2LipDemo/Wav2LipDemo.xcodeproj/project.pbxproj b/creative_apps/Wav2LipDemo/Wav2LipDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..d35acf7
--- /dev/null
+++ b/creative_apps/Wav2LipDemo/Wav2LipDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,274 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		E10001 /* Wav2LipDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = E10002; };
+		E10003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E10004; };
+		E10005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = E10006; };
+		E1WL02 /* Wav2Lip.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = E1WL01; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		E10007 /* Wav2LipDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Wav2LipDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		E10002 /* Wav2LipDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Wav2LipDemoApp.swift; sourceTree = "<group>"; };
+		E10004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		E10006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		E10008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+		E1WL01 /* Wav2Lip.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = Wav2Lip.mlpackage; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		E10009 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		E10010 = {
+			isa = PBXGroup;
+			children = (
+				E10011 /* Wav2LipDemo */,
+				E10012 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		E10011 /* Wav2LipDemo */ = {
+			isa = PBXGroup;
+			children = (
+				E10002 /* Wav2LipDemoApp.swift */,
+				E10004 /* ContentView.swift */,
+				E10006 /* Assets.xcassets */,
+				E10008 /* Info.plist */,
+				E1WL01 /* Wav2Lip.mlpackage */,
+			);
+			path = Wav2LipDemo;
+			sourceTree = "<group>";
+		};
+		E10012 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				E10007 /* Wav2LipDemo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		E10013 /* Wav2LipDemo */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = E10014;
+			buildPhases = (
+				E10015 /* Sources */,
+				E10009 /* Frameworks */,
+				E10016 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = Wav2LipDemo;
+			productName = Wav2LipDemo;
+			productReference = E10007;
+			productType = "com.apple.product-type.application";
+		};
+/* End
PBXNativeTarget section */ + +/* Begin PBXProject section */ + E10017 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + E10013 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = E10018; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = E10010; + productRefGroup = E10012; + projectDirPath = ""; + projectRoot = ""; + targets = ( + E10013, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + E10016 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + E10005 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + E10015 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + E10001 /* Wav2LipDemoApp.swift in Sources */, + E10003 /* ContentView.swift in Sources */, + E1WL02 /* Wav2Lip.mlpackage in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + E10019 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + E10020 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + E10021 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = Wav2LipDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = 
"UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.wav2lipdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + E10022 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = Wav2LipDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.wav2lipdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + E10018 /* Build configuration list for PBXProject */ = { + isa = XCConfigurationList; + buildConfigurations = ( + E10019, + E10020, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + E10014 /* Build configuration list for PBXNativeTarget */ = { + isa = XCConfigurationList; + buildConfigurations = ( + E10021, + E10022, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = E10017; +} diff --git a/creative_apps/Wav2LipDemo/Wav2LipDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/Wav2LipDemo/Wav2LipDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/Wav2LipDemo/Wav2LipDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/Wav2LipDemo/Wav2LipDemo/Assets.xcassets/Contents.json b/creative_apps/Wav2LipDemo/Wav2LipDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/creative_apps/Wav2LipDemo/Wav2LipDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/Wav2LipDemo/Wav2LipDemo/ContentView.swift b/creative_apps/Wav2LipDemo/Wav2LipDemo/ContentView.swift new file mode 100644 index 0000000..54293df --- /dev/null +++ b/creative_apps/Wav2LipDemo/Wav2LipDemo/ContentView.swift @@ -0,0 +1,648 @@ +import SwiftUI 
+import UIKit +import CoreML +import Vision +import PhotosUI +import AVFoundation + +// MARK: - Wav2Lip: Audio-Driven Talking Head Generation +// +// Wav2Lip takes a face image and a mel-spectrogram audio segment and generates +// a lip-synced face output. +// +// Model Input: +// - audio_mel (1,1,80,16): Mel-spectrogram of ~200ms audio chunk (80 mel bins x 16 time steps) +// - face_input (1,6,96,96): Concatenation of reference face (3ch) + masked lower-half face (3ch) +// +// Model Output: +// - output_face (1,3,96,96): Lip-synced face region +// +// For a full video, you would: +// 1. Extract face crops for each video frame +// 2. Compute mel-spectrogram for the entire audio +// 3. For each frame, pick the corresponding mel window and run inference +// 4. Paste the 96x96 output back into the original frame + +struct ContentView: View { + @StateObject private var viewModel = Wav2LipViewModel() + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 20) { + // Face image picker + Section { + PhotosPicker(selection: $viewModel.selectedPhoto, + matching: .images) { + if let image = viewModel.faceImage { + Image(uiImage: image) + .resizable() + .scaledToFit() + .frame(maxHeight: 220) + .cornerRadius(12) + } else { + placeholderView( + title: "Select Portrait Photo", + systemImage: "person.crop.square" + ) + } + } + } header: { + sectionHeader("Face Image") + } + + // Audio section + Section { + VStack(spacing: 12) { + // Audio recorder + HStack(spacing: 16) { + Button(action: { viewModel.toggleRecording() }) { + VStack(spacing: 6) { + Image(systemName: viewModel.isRecording ? + "stop.circle.fill" : "mic.circle.fill") + .font(.system(size: 44)) + .foregroundColor(viewModel.isRecording ? .red : .accentColor) + Text(viewModel.isRecording ? "Stop" : "Record") + .font(.caption) + .foregroundColor(viewModel.isRecording ? .red : .accentColor) + } + } + + VStack(alignment: .leading, spacing: 4) { + if viewModel.isRecording { + HStack(spacing: 4) { + Circle() + .fill(.red) + .frame(width: 8, height: 8) + Text("Recording...") + .font(.subheadline) + .foregroundColor(.red) + } + Text(String(format: "%.1fs", viewModel.recordingDuration)) + .font(.caption) + .foregroundColor(.secondary) + } else if viewModel.audioURL != nil { + HStack { + Image(systemName: "checkmark.circle.fill") + .foregroundColor(.green) + Text("Audio recorded") + .font(.subheadline) + } + Text(String(format: "Duration: %.1fs", viewModel.recordingDuration)) + .font(.caption) + .foregroundColor(.secondary) + } else { + Text("Tap to record audio for lip sync") + .font(.subheadline) + .foregroundColor(.secondary) + } + } + + Spacer() + + // Playback button + if viewModel.audioURL != nil && !viewModel.isRecording { + Button(action: { viewModel.playRecordedAudio() }) { + Image(systemName: viewModel.isPlayingAudio ? + "speaker.wave.2.fill" : "play.circle") + .font(.title2) + .foregroundColor(.accentColor) + } + } + } + .padding() + .background(Color(.systemGray6)) + .cornerRadius(12) + + // Audio waveform visualization + if viewModel.isRecording { + AudioLevelView(level: viewModel.audioLevel) + .frame(height: 40) + } + } + } header: { + sectionHeader("Audio Input") + } + + // Generate button + if viewModel.faceImage != nil && viewModel.audioURL != nil { + Button(action: { viewModel.generateLipSync() }) { + HStack { + if viewModel.isProcessing { + ProgressView() + .tint(.white) + } else { + Image(systemName: "mouth.fill") + } + Text(viewModel.isProcessing ? "Generating..." 
: "Generate Lip Sync") + } + .frame(maxWidth: .infinity) + .padding() + .background(viewModel.isProcessing ? Color.gray : Color.accentColor) + .foregroundColor(.white) + .cornerRadius(12) + } + .disabled(viewModel.isProcessing) + } + + // Processing status + if viewModel.isProcessing { + VStack(spacing: 8) { + ProgressView(value: viewModel.progress) + .progressViewStyle(.linear) + Text(viewModel.statusMessage) + .font(.caption) + .foregroundColor(.secondary) + } + .padding() + } + + // Error + if let error = viewModel.errorMessage { + Text(error) + .foregroundColor(.red) + .font(.caption) + .padding() + .frame(maxWidth: .infinity) + .background(Color.red.opacity(0.1)) + .cornerRadius(8) + } + + // Result display + if let result = viewModel.resultImage { + Section { + VStack(spacing: 12) { + HStack(spacing: 16) { + // Original face + VStack { + if let face = viewModel.faceImage { + Image(uiImage: face) + .resizable() + .scaledToFill() + .frame(width: 120, height: 120) + .clipped() + .cornerRadius(12) + } + Text("Original") + .font(.caption) + .foregroundColor(.secondary) + } + + Image(systemName: "arrow.right") + .font(.title3) + .foregroundColor(.secondary) + + // Lip-synced face + VStack { + Image(uiImage: result) + .resizable() + .scaledToFill() + .frame(width: 120, height: 120) + .clipped() + .cornerRadius(12) + Text("Lip-Synced") + .font(.caption) + .foregroundColor(.secondary) + } + } + + Text("Face + Audio = Lip-synced result (single frame preview)") + .font(.caption2) + .foregroundColor(.secondary) + + // Mel spectrogram visualization placeholder + MelSpectrogramView() + .frame(height: 60) + .cornerRadius(8) + } + } header: { + sectionHeader("Result") + } + } + } + .padding() + } + .navigationTitle("Wav2Lip") + } + } + + private func sectionHeader(_ title: String) -> some View { + HStack { + Text(title) + .font(.headline) + Spacer() + } + } + + private func placeholderView(title: String, systemImage: String) -> some View { + VStack(spacing: 12) { + Image(systemName: systemImage) + .font(.system(size: 40)) + .foregroundColor(.secondary) + Text(title) + .foregroundColor(.secondary) + } + .frame(maxWidth: .infinity) + .frame(height: 180) + .background(Color(.systemGray6)) + .cornerRadius(12) + } +} + +// MARK: - Audio Level Visualization + +struct AudioLevelView: View { + let level: Float + + var body: some View { + GeometryReader { geo in + HStack(spacing: 2) { + ForEach(0..<30, id: \.self) { i in + let barLevel = Float(i) / 30.0 + RoundedRectangle(cornerRadius: 1) + .fill(barLevel < level ? Color.green : Color(.systemGray5)) + .frame(width: (geo.size.width - 60) / 30) + } + } + .frame(height: geo.size.height) + } + } +} + +// MARK: - Mel Spectrogram Visualization + +struct MelSpectrogramView: View { + var body: some View { + GeometryReader { geo in + Canvas { context, size in + // Draw a placeholder mel-spectrogram visualization + let cols = 80 + let rows = 16 + let cellWidth = size.width / CGFloat(cols) + let cellHeight = size.height / CGFloat(rows) + + for row in 0.. UIImage { + await updateStatus("Loading model...", progress: 0.1) + + guard let modelURL = Bundle.main.url(forResource: "Wav2Lip", withExtension: "mlmodelc") else { + throw Wav2LipError.modelNotFound( + "Wav2Lip.mlmodelc not found in bundle. " + + "Please compile and add the Wav2Lip.mlpackage to the project." 
+            )
+        }
+
+        let config = MLModelConfiguration()
+        config.computeUnits = .cpuAndNeuralEngine
+        let model = try MLModel(contentsOf: modelURL, configuration: config)
+
+        await updateStatus("Preparing face input...", progress: 0.3)
+
+        // Prepare face input (1, 6, 96, 96)
+        // Channels 0-2: reference face RGB, Channels 3-5: lower-half masked face RGB
+        guard let face = faceImage,
+              let resizedFace = face.resized(to: CGSize(width: 96, height: 96)),
+              let cgFace = resizedFace.cgImage else {
+            throw Wav2LipError.processingFailed("Failed to prepare face image")
+        }
+
+        let faceArray = try MLMultiArray(shape: [1, 6, 96, 96], dataType: .float32)
+        fillFaceInput(cgFace, into: faceArray)
+
+        await updateStatus("Computing mel spectrogram...", progress: 0.5)
+
+        // Prepare audio mel spectrogram (1, 1, 80, 16)
+        // In production: compute mel spectrogram from audio using Accelerate/vDSP
+        // - Sample rate: 16kHz
+        // - FFT size: 800, Hop: 200
+        // - Mel bins: 80
+        // - Time steps per chunk: 16 (~200ms of audio)
+        let melArray = try MLMultiArray(shape: [1, 1, 80, 16], dataType: .float32)
+        // Fill with placeholder mel values (in production: real mel spectrogram)
+        try fillPlaceholderMel(melArray)
+
+        await updateStatus("Running inference...", progress: 0.7)
+
+        let inputFeatures = try MLDictionaryFeatureProvider(dictionary: [
+            "audio_mel": MLFeatureValue(multiArray: melArray),
+            "face_input": MLFeatureValue(multiArray: faceArray)
+        ])
+
+        let prediction = try model.prediction(from: inputFeatures)
+
+        await updateStatus("Extracting result...", progress: 0.9)
+
+        guard let outputArray = prediction.featureValue(for: "output_face")?.multiArrayValue else {
+            throw Wav2LipError.processingFailed("Failed to extract output face")
+        }
+
+        guard let resultImage = imageFromMultiArray(outputArray, width: 96, height: 96) else {
+            throw Wav2LipError.processingFailed("Failed to convert output to image")
+        }
+
+        await updateStatus("Complete!", progress: 1.0)
+        return resultImage
+    }
+
+    // Fill face_input MLMultiArray (1,6,96,96) from CGImage
+    // Channels 0-2: full face, Channels 3-5: lower-half masked
+    private func fillFaceInput(_ cgImage: CGImage, into array: MLMultiArray) {
+        let width = 96
+        let height = 96
+        let bytesPerPixel = 4
+        var pixelData = [UInt8](repeating: 0, count: width * height * bytesPerPixel)
+
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        guard let context = CGContext(
+            data: &pixelData, width: width, height: height,
+            bitsPerComponent: 8, bytesPerRow: bytesPerPixel * width,
+            space: colorSpace,
+            bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
+        ) else { return }
+
+        context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height))
+
+        for y in 0..<height {
+            for x in 0..<width {
+                let offset = (y * width + x) * bytesPerPixel
+                let r = Float(pixelData[offset]) / 255.0
+                let g = Float(pixelData[offset + 1]) / 255.0
+                let b = Float(pixelData[offset + 2]) / 255.0
+                // Channels 0-2: the full reference face
+                array[[0, 0, y, x] as [NSNumber]] = NSNumber(value: r)
+                array[[0, 1, y, x] as [NSNumber]] = NSNumber(value: g)
+                array[[0, 2, y, x] as [NSNumber]] = NSNumber(value: b)
+                // Channels 3-5: same face with the lower half zeroed out
+                let isMasked = y > height / 2
+                array[[0, 3, y, x] as [NSNumber]] = NSNumber(value: isMasked ? 0.0 : r)
+                array[[0, 4, y, x] as [NSNumber]] = NSNumber(value: isMasked ? 0.0 : g)
+                array[[0, 5, y, x] as [NSNumber]] = NSNumber(value: isMasked ? 0.0 : b)
+            }
+        }
+    }
+
+    // Fill placeholder mel spectrogram data
+    private func fillPlaceholderMel(_ array: MLMultiArray) throws {
+        // In production, compute real mel spectrogram from the recorded audio:
+        // 1. Load audio samples at 16kHz mono
+        // 2. Apply STFT with window=800, hop=200
+        // 3. Apply mel filterbank (80 bins)
+        // 4. Take log magnitude
+        // 5.
Extract 16-frame windows for each video frame
+        for mel in 0..<80 {
+            for t in 0..<16 {
+                let value = Float.random(in: -4.0...0.0) // Placeholder: log-mel range
+                array[[0, 0, mel, t] as [NSNumber]] = NSNumber(value: value)
+            }
+        }
+    }
+
+    // Convert (1,3,96,96) MLMultiArray back to UIImage
+    private func imageFromMultiArray(_ array: MLMultiArray, width: Int, height: Int) -> UIImage? {
+        var pixelData = [UInt8](repeating: 255, count: width * height * 4)
+
+        for y in 0..<height {
+            for x in 0..<width {
+                let r = array[[0, 0, y, x] as [NSNumber]].floatValue
+                let g = array[[0, 1, y, x] as [NSNumber]].floatValue
+                let b = array[[0, 2, y, x] as [NSNumber]].floatValue
+                let pixelIndex = (y * width + x) * 4
+                pixelData[pixelIndex] = UInt8(max(0, min(255, r * 255)))
+                pixelData[pixelIndex + 1] = UInt8(max(0, min(255, g * 255)))
+                pixelData[pixelIndex + 2] = UInt8(max(0, min(255, b * 255)))
+            }
+        }
+
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        guard let context = CGContext(
+            data: &pixelData, width: width, height: height,
+            bitsPerComponent: 8, bytesPerRow: width * 4,
+            space: colorSpace,
+            bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
+        ), let cgImage = context.makeImage() else { return nil }
+
+        return UIImage(cgImage: cgImage)
+    }
+}
+
+extension UIImage {
+    func resized(to targetSize: CGSize) -> UIImage? {
+        let renderer = UIGraphicsImageRenderer(size: targetSize)
+        return renderer.image { _ in
+            self.draw(in: CGRect(origin: .zero, size: targetSize))
+        }
+    }
+}
+
+#Preview {
+    ContentView()
+}
diff --git a/creative_apps/Wav2LipDemo/Wav2LipDemo/Info.plist b/creative_apps/Wav2LipDemo/Wav2LipDemo/Info.plist
new file mode 100644
index 0000000..ab9c205
--- /dev/null
+++ b/creative_apps/Wav2LipDemo/Wav2LipDemo/Info.plist
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>This app needs access to your photo library to select portrait photos for lip sync generation.</string>
+	<key>NSMicrophoneUsageDescription</key>
+	<string>This app uses the microphone to record audio that drives lip sync animation.</string>
+	<key>NSCameraUsageDescription</key>
+	<string>This app may use the camera to capture face photos for lip sync generation.</string>
+</dict>
+</plist>
diff --git a/creative_apps/Wav2LipDemo/Wav2LipDemo/Wav2LipDemoApp.swift b/creative_apps/Wav2LipDemo/Wav2LipDemo/Wav2LipDemoApp.swift
new file mode 100644
index 0000000..0cb8b44
--- /dev/null
+++ b/creative_apps/Wav2LipDemo/Wav2LipDemo/Wav2LipDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct Wav2LipDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo.xcodeproj/project.pbxproj b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..9c31e21
--- /dev/null
+++ b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,342 @@
+// !$*UTF8*$!
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + B20000010000000000000001 /* ConvNeXtTinyDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = B20000020000000000000001 /* ConvNeXtTinyDemoApp.swift */; }; + B20000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B20000020000000000000002 /* ContentView.swift */; }; + B20000010000000000000003 /* ImageNetLabels.swift in Sources */ = {isa = PBXBuildFile; fileRef = B20000020000000000000003 /* ImageNetLabels.swift */; }; + B20000010000000000000004 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = B20000020000000000000004 /* Assets.xcassets */; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + B20000020000000000000001 /* ConvNeXtTinyDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvNeXtTinyDemoApp.swift; sourceTree = ""; }; + B20000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + B20000020000000000000003 /* ImageNetLabels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageNetLabels.swift; sourceTree = ""; }; + B20000020000000000000004 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + B20000020000000000000005 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + B20000020000000000000010 /* ConvNeXtTinyDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = ConvNeXtTinyDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + B20000030000000000000001 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + B20000040000000000000001 = { + isa = PBXGroup; + children = ( + B20000040000000000000002 /* ConvNeXtTinyDemo */, + B20000040000000000000003 /* Products */, + ); + sourceTree = ""; + }; + B20000040000000000000002 /* ConvNeXtTinyDemo */ = { + isa = PBXGroup; + children = ( + B20000020000000000000001 /* ConvNeXtTinyDemoApp.swift */, + B20000020000000000000002 /* ContentView.swift */, + B20000020000000000000003 /* ImageNetLabels.swift */, + B20000020000000000000004 /* Assets.xcassets */, + B20000020000000000000005 /* Info.plist */, + ); + path = ConvNeXtTinyDemo; + sourceTree = ""; + }; + B20000040000000000000003 /* Products */ = { + isa = PBXGroup; + children = ( + B20000020000000000000010 /* ConvNeXtTinyDemo.app */, + ); + name = Products; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + B20000050000000000000001 /* ConvNeXtTinyDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = B20000070000000000000001 /* Build configuration list for PBXNativeTarget "ConvNeXtTinyDemo" */; + buildPhases = ( + B20000060000000000000001 /* Sources */, + B20000030000000000000001 /* Frameworks */, + B20000060000000000000002 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = ConvNeXtTinyDemo; + productName = ConvNeXtTinyDemo; + productReference = B20000020000000000000010 /* ConvNeXtTinyDemo.app */; + productType = 
"com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + B20000080000000000000001 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + B20000050000000000000001 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = B20000070000000000000003 /* Build configuration list for PBXProject "ConvNeXtTinyDemo" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = B20000040000000000000001; + productRefGroup = B20000040000000000000003 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + B20000050000000000000001 /* ConvNeXtTinyDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + B20000060000000000000002 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + B20000010000000000000004 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + B20000060000000000000001 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + B20000010000000000000001 /* ConvNeXtTinyDemoApp.swift in Sources */, + B20000010000000000000002 /* ContentView.swift in Sources */, + B20000010000000000000003 /* ImageNetLabels.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + B20000090000000000000001 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + 
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + B20000090000000000000002 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + B20000090000000000000003 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = ConvNeXtTinyDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.convnexttiny"; + 
PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + B20000090000000000000004 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = ConvNeXtTinyDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.convnexttiny"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + B20000070000000000000001 /* Build configuration list for PBXNativeTarget "ConvNeXtTinyDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + B20000090000000000000003 /* Debug */, + B20000090000000000000004 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + B20000070000000000000003 /* Build configuration list for PBXProject "ConvNeXtTinyDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + B20000090000000000000001 /* Debug */, + B20000090000000000000002 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = B20000080000000000000001 /* Project object */; +} diff --git a/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/Assets.xcassets/Contents.json b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/ContentView.swift b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/ContentView.swift new file mode 100644 index 0000000..2748d09 --- /dev/null +++ b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/ContentView.swift @@ -0,0 +1,225 @@ +import SwiftUI +import UIKit +import PhotosUI +import CoreML +import Vision + +// MARK: - Classifier + +class 
ConvNeXtClassifier: ObservableObject {
+    @Published var predictions: [(label: String, confidence: Float)] = []
+    @Published var errorMessage: String?
+    @Published var isProcessing = false
+
+    private var vnModel: VNCoreMLModel?
+
+    init() {
+        loadModel()
+    }
+
+    private func loadModel() {
+        // PLACEHOLDER: Add ConvNeXtTiny.mlpackage to the Xcode project.
+        // The compiled .mlmodelc will be bundled automatically.
+        // Download from the CoreML-Models repository and drag into Xcode.
+
+        guard let modelURL = Bundle.main.url(forResource: "ConvNeXtTiny", withExtension: "mlmodelc") else {
+            DispatchQueue.main.async {
+                self.errorMessage = "Model not found. Please add ConvNeXtTiny.mlpackage to the Xcode project."
+            }
+            return
+        }
+
+        do {
+            let config = MLModelConfiguration()
+            config.computeUnits = .all
+            let mlModel = try MLModel(contentsOf: modelURL, configuration: config)
+            vnModel = try VNCoreMLModel(for: mlModel)
+        } catch {
+            DispatchQueue.main.async {
+                self.errorMessage = "Failed to load model: \(error.localizedDescription)"
+            }
+        }
+    }
+
+    func classify(image: UIImage) {
+        guard let vnModel = vnModel else { return }
+        guard let cgImage = image.cgImage else { return }
+
+        DispatchQueue.main.async { self.isProcessing = true }
+
+        let request = VNCoreMLRequest(model: vnModel) { [weak self] request, error in
+            if let results = request.results as? [VNCoreMLFeatureValueObservation],
+               let multiArray = results.first?.featureValue.multiArrayValue {
+                self?.processResults(multiArray: multiArray)
+            } else if let results = request.results as? [VNClassificationObservation] {
+                let top5 = results.prefix(5).map { (label: $0.identifier, confidence: $0.confidence) }
+                DispatchQueue.main.async {
+                    self?.predictions = top5
+                    self?.isProcessing = false
+                }
+            }
+        }
+        request.imageCropAndScaleOption = .centerCrop
+
+        DispatchQueue.global(qos: .userInitiated).async {
+            let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+            try? handler.perform([request])
+        }
+    }
+
+    private func processResults(multiArray: MLMultiArray) {
+        let count = multiArray.count
+        var scores = [Float](repeating: 0, count: count)
+        for i in 0..<count {
+            scores[i] = multiArray[i].floatValue
+        }
+        let top = ImageNetLabels.topK(scores: scores, k: 5)
+        DispatchQueue.main.async {
+            self.predictions = top.map { (label: $0.label, confidence: $0.score) }
+            self.isProcessing = false
+        }
+    }
+}
+
+// MARK: - Content View
+
+struct ContentView: View {
+    @StateObject private var classifier = ConvNeXtClassifier()
+    @State private var selectedItem: PhotosPickerItem?
+    @State private var inputImage: UIImage?
+
+    var body: some View {
+        NavigationStack {
+            VStack(spacing: 16) {
+                PhotosPicker(selection: $selectedItem, matching: .images) {
+                    if let image = inputImage {
+                        Image(uiImage: image)
+                            .resizable()
+                            .scaledToFit()
+                            .frame(maxHeight: 280)
+                            .cornerRadius(12)
+                    } else {
+                        Label("Select Photo", systemImage: "photo.on.rectangle")
+                            .frame(maxWidth: .infinity, minHeight: 180)
+                            .background(Color(.systemGray6))
+                            .cornerRadius(12)
+                    }
+                }
+                .onChange(of: selectedItem) { item in
+                    Task {
+                        if let data = try? await item?.loadTransferable(type: Data.self),
+                           let image = UIImage(data: data) {
+                            inputImage = image
+                            classifier.classify(image: image)
+                        }
+                    }
+                }
+
+                if classifier.isProcessing {
+                    ProgressView("Classifying...")
+                }
+
+                if let error = classifier.errorMessage {
+                    Text(error)
+                        .font(.caption)
+                        .foregroundColor(.red)
+                }
+
+                // Top-5 predictions, tinted by rank
+                ForEach(Array(classifier.predictions.enumerated()), id: \.offset) { index, prediction in
+                    HStack {
+                        Text(prediction.label)
+                            .font(.subheadline)
+                        Spacer()
+                        Text(String(format: "%.1f%%", prediction.confidence * 100))
+                            .font(.caption)
+                            .foregroundColor(.secondary)
+                    }
+                    .padding(8)
+                    .background(rankColor(for: index).opacity(0.15))
+                    .cornerRadius(8)
+                }
+
+                Spacer()
+            }
+            .padding()
+            .navigationTitle("ConvNeXt-Tiny")
+        }
+    }
+
+    private func rankColor(for index: Int) -> Color {
+        switch index {
+        case 0: return .blue
+        case 1: return .green
+        case 2: return .orange
+        case 3: return .purple
+        case 4: return .pink
+        default: return .gray
+        }
+    }
+}
+
+#Preview {
+    ContentView()
+}
diff --git a/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/ConvNeXtTinyDemoApp.swift b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/ConvNeXtTinyDemoApp.swift
new file mode 100644
index 0000000..eaf67e0
--- /dev/null
+++ b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/ConvNeXtTinyDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct ConvNeXtTinyDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/ImageNetLabels.swift b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/ImageNetLabels.swift
new file mode 100644
index 0000000..08f202d
--- /dev/null
+++ b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/ImageNetLabels.swift
@@ -0,0 +1,95 @@
+import Foundation
+
+// MARK: - ImageNet Labels (Condensed Demo Version)
+// This file contains a small subset of common ImageNet-1K labels for demo purposes.
+// For the full 1000-class label list, download from:
+// https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt
+// and replace this array with all 1000 entries.
+
+struct ImageNetLabels {
+    /// Full ImageNet-1K has 1000 labels. This is a condensed demo set.
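+    ///
+    /// Minimal usage sketch (hypothetical values; `scores` would come from the
+    /// model's 1000-element output):
+    ///
+    ///     let top = ImageNetLabels.topK(scores: scores, k: 5)
+    ///     for p in top { print("\(p.label): \(p.score)") }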
+ /// Index positions correspond to the model output indices. + /// Replace with the full list for production use. + static let labels: [Int: String] = [ + 0: "tench", + 1: "goldfish", + 2: "great white shark", + 7: "cock", + 14: "indigo bunting", + 65: "sea snake", + 99: "goose", + 207: "golden retriever", + 208: "Labrador retriever", + 231: "collie", + 235: "German shepherd", + 258: "Samoyed", + 259: "Pomeranian", + 281: "tabby cat", + 282: "tiger cat", + 285: "Egyptian cat", + 291: "lion", + 340: "zebra", + 386: "African elephant", + 409: "analog clock", + 417: "balloon", + 430: "basketball", + 446: "bikini", + 457: "bow tie", + 468: "cab", + 504: "coffee mug", + 508: "computer keyboard", + 531: "digital watch", + 537: "dog sled", + 539: "drum", + 549: "envelope", + 555: "fire truck", + 569: "fountain", + 604: "golf ball", + 609: "grand piano", + 620: "hamburger", + 659: "mixing bowl", + 671: "mountain bike", + 673: "mouse", + 701: "parachute", + 717: "pickup truck", + 737: "pot", + 755: "redbone", + 779: "school bus", + 812: "space shuttle", + 817: "sports car", + 834: "sunglasses", + 849: "tennis ball", + 852: "thatch", + 859: "toaster", + 876: "tray", + 880: "umbrella", + 892: "wall clock", + 907: "wine bottle", + 920: "traffic light", + 934: "hot dog", + 945: "bell pepper", + 947: "mushroom", + 950: "orange", + 954: "banana", + 963: "pizza", + 965: "burrito", + 967: "espresso", + 985: "daisy", + 988: "sunflower", + 999: "toilet tissue" + ] + + /// Get the label for a given class index. + /// Returns "class_{index}" for indices not in the condensed set. + static func label(for index: Int) -> String { + return labels[index] ?? "class_\(index)" + } + + /// Get top-K predictions from a probability/score array. + static func topK(scores: [Float], k: Int = 5) -> [(index: Int, label: String, score: Float)] { + let indexed = scores.enumerated().map { (index: $0.offset, score: $0.element) } + let sorted = indexed.sorted { $0.score > $1.score } + let topK = sorted.prefix(k) + return topK.map { (index: $0.index, label: label(for: $0.index), score: $0.score) } + } +} diff --git a/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/Info.plist b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/Info.plist new file mode 100644 index 0000000..6631ffa --- /dev/null +++ b/sample_apps/ConvNeXtTinyDemo/ConvNeXtTinyDemo/Info.plist @@ -0,0 +1,6 @@ + + + + + + diff --git a/sample_apps/DeepLabV3Demo/DeepLabV3Demo.xcodeproj/project.pbxproj b/sample_apps/DeepLabV3Demo/DeepLabV3Demo.xcodeproj/project.pbxproj new file mode 100644 index 0000000..6341725 --- /dev/null +++ b/sample_apps/DeepLabV3Demo/DeepLabV3Demo.xcodeproj/project.pbxproj @@ -0,0 +1,344 @@ +// !$*UTF8*$! 
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + C30000010000000000000001 /* DeepLabV3DemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = C30000020000000000000001 /* DeepLabV3DemoApp.swift */; }; + C30000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = C30000020000000000000002 /* ContentView.swift */; }; + C30000010000000000000003 /* ImageNetLabels.swift in Sources */ = {isa = PBXBuildFile; fileRef = C30000020000000000000003 /* ImageNetLabels.swift */; }; + C30000010000000000000004 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = C30000020000000000000004 /* Assets.xcassets */; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + C30000020000000000000001 /* DeepLabV3DemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DeepLabV3DemoApp.swift; sourceTree = ""; }; + C30000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + C30000020000000000000003 /* ImageNetLabels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageNetLabels.swift; sourceTree = ""; }; + C30000020000000000000004 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + C30000020000000000000005 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + C30000020000000000000010 /* DeepLabV3Demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DeepLabV3Demo.app; sourceTree = BUILT_PRODUCTS_DIR; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + C30000030000000000000001 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + C30000040000000000000001 = { + isa = PBXGroup; + children = ( + C30000040000000000000002 /* DeepLabV3Demo */, + C30000040000000000000003 /* Products */, + ); + sourceTree = ""; + }; + C30000040000000000000002 /* DeepLabV3Demo */ = { + isa = PBXGroup; + children = ( + C30000020000000000000001 /* DeepLabV3DemoApp.swift */, + C30000020000000000000002 /* ContentView.swift */, + C30000020000000000000003 /* ImageNetLabels.swift */, + C30000020000000000000004 /* Assets.xcassets */, + C30000020000000000000005 /* Info.plist */, + ); + path = DeepLabV3Demo; + sourceTree = ""; + }; + C30000040000000000000003 /* Products */ = { + isa = PBXGroup; + children = ( + C30000020000000000000010 /* DeepLabV3Demo.app */, + ); + name = Products; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + C30000050000000000000001 /* DeepLabV3Demo */ = { + isa = PBXNativeTarget; + buildConfigurationList = C30000070000000000000001 /* Build configuration list for PBXNativeTarget "DeepLabV3Demo" */; + buildPhases = ( + C30000060000000000000001 /* Sources */, + C30000030000000000000001 /* Frameworks */, + C30000060000000000000002 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = DeepLabV3Demo; + productName = DeepLabV3Demo; + productReference = C30000020000000000000010 /* DeepLabV3Demo.app */; + productType = "com.apple.product-type.application"; + }; +/* End 
PBXNativeTarget section */ + +/* Begin PBXProject section */ + C30000080000000000000001 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + C30000050000000000000001 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = C30000070000000000000003 /* Build configuration list for PBXProject "DeepLabV3Demo" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = C30000040000000000000001; + productRefGroup = C30000040000000000000003 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + C30000050000000000000001 /* DeepLabV3Demo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + C30000060000000000000002 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + C30000010000000000000004 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + C30000060000000000000001 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + C30000010000000000000001 /* DeepLabV3DemoApp.swift in Sources */, + C30000010000000000000002 /* ContentView.swift in Sources */, + C30000010000000000000003 /* ImageNetLabels.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + C30000090000000000000001 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + 
GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + C30000090000000000000002 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + C30000090000000000000003 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = DeepLabV3Demo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time scene segmentation."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + 
PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.deeplabv3"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + C30000090000000000000004 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = DeepLabV3Demo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time scene segmentation."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.deeplabv3"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + C30000070000000000000001 /* Build configuration list for PBXNativeTarget "DeepLabV3Demo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + C30000090000000000000003 /* Debug */, + C30000090000000000000004 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + C30000070000000000000003 /* Build configuration list for PBXProject "DeepLabV3Demo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + C30000090000000000000001 /* Debug */, + C30000090000000000000002 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = C30000080000000000000001 /* Project object */; +} diff --git a/sample_apps/DeepLabV3Demo/DeepLabV3Demo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/DeepLabV3Demo/DeepLabV3Demo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/sample_apps/DeepLabV3Demo/DeepLabV3Demo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/DeepLabV3Demo/DeepLabV3Demo/Assets.xcassets/Contents.json b/sample_apps/DeepLabV3Demo/DeepLabV3Demo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/sample_apps/DeepLabV3Demo/DeepLabV3Demo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/DeepLabV3Demo/DeepLabV3Demo/ContentView.swift b/sample_apps/DeepLabV3Demo/DeepLabV3Demo/ContentView.swift new file mode 100644 index 0000000..db1c5fa --- /dev/null +++ b/sample_apps/DeepLabV3Demo/DeepLabV3Demo/ContentView.swift @@ -0,0 +1,371 @@ +import SwiftUI 
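+// An alternative per-pixel argmax sketch (an assumption, not what this sample's
+// processSegmentation(multiArray:) below uses): Accelerate's vDSP_maxvi can scan
+// one pixel's 21 class scores with a stride of width * height and report the
+// winning flat offset, which divides back to a class index.
+func argmaxClass(scores: UnsafePointer<Float>,
+                 pixel: Int,
+                 numClasses: Int,
+                 pixelsPerPlane: Int) -> Int {
+    var maxValue: Float = 0
+    var maxIndex: vDSP_Length = 0
+    // The stride jumps from one class plane to the next for the same pixel.
+    vDSP_maxvi(scores + pixel, vDSP_Stride(pixelsPerPlane), &maxValue, &maxIndex, vDSP_Length(numClasses))
+    // vDSP reports the offset from the start pointer (a multiple of the stride).
+    return Int(maxIndex) / pixelsPerPlane
+}
+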
+import UIKit
+import AVFoundation
+import CoreML
+import Vision
+import Accelerate
+
+// MARK: - Segmentation Classes
+
+struct SegmentationClass {
+    let name: String
+    let color: SIMD4<UInt8> // RGBA
+}
+
+let segmentationClasses: [SegmentationClass] = [
+    SegmentationClass(name: "Background", color: SIMD4(0, 0, 0, 0)),
+    SegmentationClass(name: "Aeroplane", color: SIMD4(128, 0, 0, 180)),
+    SegmentationClass(name: "Bicycle", color: SIMD4(0, 128, 0, 180)),
+    SegmentationClass(name: "Bird", color: SIMD4(128, 128, 0, 180)),
+    SegmentationClass(name: "Boat", color: SIMD4(0, 0, 128, 180)),
+    SegmentationClass(name: "Bottle", color: SIMD4(128, 0, 128, 180)),
+    SegmentationClass(name: "Bus", color: SIMD4(0, 128, 128, 180)),
+    SegmentationClass(name: "Car", color: SIMD4(128, 128, 128, 180)),
+    SegmentationClass(name: "Cat", color: SIMD4(64, 0, 0, 180)),
+    SegmentationClass(name: "Chair", color: SIMD4(192, 0, 0, 180)),
+    SegmentationClass(name: "Cow", color: SIMD4(64, 128, 0, 180)),
+    SegmentationClass(name: "Dining Table", color: SIMD4(192, 128, 0, 180)),
+    SegmentationClass(name: "Dog", color: SIMD4(64, 0, 128, 180)),
+    SegmentationClass(name: "Horse", color: SIMD4(192, 0, 128, 180)),
+    SegmentationClass(name: "Motorbike", color: SIMD4(64, 128, 128, 180)),
+    SegmentationClass(name: "Person", color: SIMD4(192, 128, 128, 180)),
+    SegmentationClass(name: "Potted Plant", color: SIMD4(0, 64, 0, 180)),
+    SegmentationClass(name: "Sheep", color: SIMD4(128, 64, 0, 180)),
+    SegmentationClass(name: "Sofa", color: SIMD4(0, 192, 0, 180)),
+    SegmentationClass(name: "Train", color: SIMD4(128, 192, 0, 180)),
+    SegmentationClass(name: "TV/Monitor", color: SIMD4(0, 64, 128, 180))
+]
+
+// MARK: - Camera Manager
+
+class CameraManager: NSObject, ObservableObject {
+    let session = AVCaptureSession()
+    var onFrame: ((CMSampleBuffer) -> Void)?
+
+    private let sessionQueue = DispatchQueue(label: "camera.session")
+
+    func configure() {
+        sessionQueue.async { [weak self] in
+            self?.setupSession()
+        }
+    }
+
+    private func setupSession() {
+        session.beginConfiguration()
+        session.sessionPreset = .high
+
+        guard let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .back),
+              let input = try?
AVCaptureDeviceInput(device: device) else { + session.commitConfiguration() + return + } + + if session.canAddInput(input) { + session.addInput(input) + } + + let output = AVCaptureVideoDataOutput() + output.setSampleBufferDelegate(self, queue: DispatchQueue(label: "camera.frame")) + output.alwaysDiscardsLateVideoFrames = true + + if session.canAddOutput(output) { + session.addOutput(output) + } + + session.commitConfiguration() + session.startRunning() + } + + func stop() { + sessionQueue.async { [weak self] in + self?.session.stopRunning() + } + } +} + +extension CameraManager: AVCaptureVideoDataOutputSampleBufferDelegate { + func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + onFrame?(sampleBuffer) + } +} + +// MARK: - Camera Preview + +struct CameraPreview: UIViewRepresentable { + let session: AVCaptureSession + + func makeUIView(context: Context) -> UIView { + let view = UIView(frame: .zero) + let previewLayer = AVCaptureVideoPreviewLayer(session: session) + previewLayer.videoGravity = .resizeAspectFill + view.layer.addSublayer(previewLayer) + context.coordinator.previewLayer = previewLayer + return view + } + + func updateUIView(_ uiView: UIView, context: Context) { + context.coordinator.previewLayer?.frame = uiView.bounds + } + + func makeCoordinator() -> Coordinator { + Coordinator() + } + + class Coordinator { + var previewLayer: AVCaptureVideoPreviewLayer? + } +} + +// MARK: - Segmentation Engine + +class SegmentationEngine: ObservableObject { + @Published var overlayImage: UIImage? + @Published var detectedClasses: [String] = [] + @Published var errorMessage: String? + + private var vnModel: VNCoreMLModel? + private var isProcessing = false + + init() { + loadModel() + } + + private func loadModel() { + // PLACEHOLDER: Add DeepLabV3MobileNetV3.mlpackage to the Xcode project. + // The compiled .mlmodelc will be bundled automatically. + // Download from the CoreML-Models repository and drag into Xcode. + + guard let modelURL = Bundle.main.url(forResource: "DeepLabV3MobileNetV3", withExtension: "mlmodelc") else { + DispatchQueue.main.async { + self.errorMessage = "Model not found. Please add DeepLabV3MobileNetV3.mlpackage to the Xcode project." + } + return + } + + do { + let config = MLModelConfiguration() + config.computeUnits = .all + let mlModel = try MLModel(contentsOf: modelURL, configuration: config) + vnModel = try VNCoreMLModel(for: mlModel) + } catch { + DispatchQueue.main.async { + self.errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + } + + func segment(sampleBuffer: CMSampleBuffer) { + guard !isProcessing, let vnModel = vnModel else { return } + guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } + isProcessing = true + + let request = VNCoreMLRequest(model: vnModel) { [weak self] request, error in + defer { self?.isProcessing = false } + + if let results = request.results as? [VNCoreMLFeatureValueObservation], + let multiArray = results.first?.featureValue.multiArrayValue { + self?.processSegmentation(multiArray: multiArray) + } + } + request.imageCropAndScaleOption = .scaleFill + + let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .right, options: [:]) + try? 
handler.perform([request])
+    }
+
+    private func processSegmentation(multiArray: MLMultiArray) {
+        // Output shape: 1 x 21 x 512 x 512
+        let numClasses = 21
+        let height = 512
+        let width = 512
+
+        let pointer = multiArray.dataPointer.bindMemory(to: Float.self, capacity: multiArray.count)
+
+        // For each pixel, find the class with the highest score (argmax across 21 classes)
+        var pixelData = [UInt8](repeating: 0, count: width * height * 4) // RGBA
+        var foundClasses = Set<Int>()
+
+        for y in 0..<height {
+            for x in 0..<width {
+                var maxClass = 0
+                var maxVal = pointer[y * width + x]
+                for c in 1..<numClasses {
+                    let val = pointer[(c * height + y) * width + x]
+                    if val > maxVal {
+                        maxVal = val
+                        maxClass = c
+                    }
+                }
+
+                if maxClass != 0 {
+                    foundClasses.insert(maxClass)
+                }
+
+                let color = segmentationClasses[maxClass].color
+                let pixelIndex = (y * width + x) * 4
+                pixelData[pixelIndex] = color.x     // R
+                pixelData[pixelIndex + 1] = color.y // G
+                pixelData[pixelIndex + 2] = color.z // B
+                pixelData[pixelIndex + 3] = color.w // A
+            }
+        }
+
+        // Create UIImage from pixel data
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.premultipliedLast.rawValue)
+
+        guard let context = CGContext(
+            data: &pixelData,
+            width: width,
+            height: height,
+            bitsPerComponent: 8,
+            bytesPerRow: width * 4,
+            space: colorSpace,
+            bitmapInfo: bitmapInfo.rawValue
+        ), let cgImage = context.makeImage() else { return }
+
+        let image = UIImage(cgImage: cgImage)
+        let classes = foundClasses.sorted().map { segmentationClasses[$0].name }
+
+        DispatchQueue.main.async {
+            self.overlayImage = image
+            self.detectedClasses = classes
+        }
+    }
+}
+
+// MARK: - Content View
+
+struct ContentView: View {
+    @StateObject private var camera = CameraManager()
+    @StateObject private var segEngine = SegmentationEngine()
+    @State private var showLegend = false
+
+    var body: some View {
+        ZStack {
+            // Camera feed
+            CameraPreview(session: camera.session)
+                .ignoresSafeArea()
+
+            // Segmentation overlay
+            if let overlay = segEngine.overlayImage {
+                Image(uiImage: overlay)
+                    .resizable()
+                    .scaledToFill()
+                    .ignoresSafeArea()
+                    .allowsHitTesting(false)
+            }
+
+            VStack {
+                // Top bar with title and legend toggle
+                HStack {
+                    Text("DeepLabV3 Segmentation")
+                        .font(.headline)
+                        .foregroundColor(.white)
+                        .shadow(radius: 2)
+
+                    Spacer()
+
+                    Button(action: { showLegend.toggle() }) {
+                        Image(systemName: "list.bullet")
+                            .font(.title3)
+                            .foregroundColor(.white)
+                            .padding(8)
+                            .background(.black.opacity(0.5), in: Circle())
+                    }
+                }
+                .padding()
+
+                Spacer()
+
+                // Error message
+                if let error = segEngine.errorMessage {
+                    VStack(spacing: 8) {
+                        Image(systemName: "exclamationmark.triangle.fill")
+                            .font(.largeTitle)
+                            .foregroundColor(.yellow)
+                        Text(error)
+                            .font(.caption)
+                            .multilineTextAlignment(.center)
+                            .padding(.horizontal)
+                    }
+                    .padding()
+                    .background(.ultraThinMaterial, in: RoundedRectangle(cornerRadius: 16))
+                    .padding()
+                }
+
+                // Detected classes
+                if !segEngine.detectedClasses.isEmpty {
+                    ScrollView(.horizontal, showsIndicators: false) {
+                        HStack(spacing: 8) {
+                            ForEach(segEngine.detectedClasses, id: \.self) { className in
+                                Text(className)
+                                    .font(.caption)
+                                    .fontWeight(.medium)
+                                    .padding(.horizontal, 10)
+                                    .padding(.vertical, 4)
+                                    .background(.black.opacity(0.6))
+                                    .foregroundColor(.white)
+                                    .cornerRadius(12)
+                            }
+                        }
+                        .padding(.horizontal)
+                    }
+                    .padding(.bottom, 8)
+                }
+            }
+
+            // Legend sheet
+            if showLegend {
+                VStack {
+                    HStack {
+                        Text("Class Legend")
+                            .font(.headline)
+                        Spacer()
+                        Button("Done") { showLegend = false }
+                    }
+                    .padding()
+
+                    ScrollView {
+                        LazyVGrid(columns: [GridItem(.flexible()), GridItem(.flexible())], spacing: 8) {
+                            ForEach(1..<segmentationClasses.count, id: \.self) { index in
+                                let segClass = segmentationClasses[index]
+                                HStack(spacing: 6) {
+                                    RoundedRectangle(cornerRadius: 4)
+                                        .fill(Color(
+                                            red: Double(segClass.color.x) / 255.0,
+                                            green: Double(segClass.color.y) / 255.0,
+                                            blue: Double(segClass.color.z) / 255.0
+                                        ))
+                                        .frame(width: 18, height: 18)
+                                    Text(segClass.name)
+                                        .font(.caption)
+                                    Spacer()
+                                }
+                            }
+                        }
+                        .padding(.horizontal)
+                    }
+                }
+                .background(.ultraThinMaterial, in: RoundedRectangle(cornerRadius: 16))
+                .padding()
+            }
+        }
+        .onAppear {
+            camera.onFrame = { buffer in
+                segEngine.segment(sampleBuffer: buffer)
+            }
+            camera.configure()
+        }
+        .onDisappear {
+            camera.stop()
+        }
+    }
+}
+
+#Preview {
+    ContentView()
+}
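+
+// A minimal sketch, not part of the original sample: the demo starts the
+// capture session without ever checking camera authorization. A helper like
+// this (names are illustrative) could gate camera.configure() until access
+// has actually been granted.
+enum CameraPermission {
+    static func request(_ completion: @escaping (Bool) -> Void) {
+        switch AVCaptureDevice.authorizationStatus(for: .video) {
+        case .authorized:
+            completion(true)
+        case .notDetermined:
+            AVCaptureDevice.requestAccess(for: .video) { granted in
+                DispatchQueue.main.async { completion(granted) }
+            }
+        default:
+            completion(false)
+        }
+    }
+}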
diff --git a/sample_apps/DeepLabV3Demo/DeepLabV3Demo/DeepLabV3DemoApp.swift b/sample_apps/DeepLabV3Demo/DeepLabV3Demo/DeepLabV3DemoApp.swift
new file mode 100644
--- /dev/null
+++ b/sample_apps/DeepLabV3Demo/DeepLabV3Demo/DeepLabV3DemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct DeepLabV3DemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/sample_apps/DeepLabV3Demo/DeepLabV3Demo/ImageNetLabels.swift b/sample_apps/DeepLabV3Demo/DeepLabV3Demo/ImageNetLabels.swift
new file mode 100644
index 0000000..08f202d
--- /dev/null
+++ b/sample_apps/DeepLabV3Demo/DeepLabV3Demo/ImageNetLabels.swift
@@ -0,0 +1,95 @@
+import Foundation
+
+// MARK: - ImageNet Labels (Condensed Demo Version)
+// This file contains a small subset (about 65 of the 1000 ImageNet-1K labels) for demo purposes.
+// For the full 1000-class label list, download from:
+// https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt
+// and replace this dictionary with all 1000 entries.
+
+struct ImageNetLabels {
+    /// Full ImageNet-1K has 1000 labels. This is a condensed demo set.
+    /// Index positions correspond to the model output indices.
+    /// Replace with the full list for production use.
+    static let labels: [Int: String] = [
+        0: "tench",
+        1: "goldfish",
+        2: "great white shark",
+        7: "cock",
+        14: "indigo bunting",
+        65: "sea snake",
+        99: "goose",
+        207: "golden retriever",
+        208: "Labrador retriever",
+        231: "collie",
+        235: "German shepherd",
+        258: "Samoyed",
+        259: "Pomeranian",
+        281: "tabby cat",
+        282: "tiger cat",
+        285: "Egyptian cat",
+        291: "lion",
+        340: "zebra",
+        386: "African elephant",
+        409: "analog clock",
+        417: "balloon",
+        430: "basketball",
+        446: "bikini",
+        457: "bow tie",
+        468: "cab",
+        504: "coffee mug",
+        508: "computer keyboard",
+        531: "digital watch",
+        537: "dog sled",
+        539: "drum",
+        549: "envelope",
+        555: "fire truck",
+        569: "fountain",
+        604: "golf ball",
+        609: "grand piano",
+        620: "hamburger",
+        659: "mixing bowl",
+        671: "mountain bike",
+        673: "mouse",
+        701: "parachute",
+        717: "pickup truck",
+        737: "pot",
+        755: "redbone",
+        779: "school bus",
+        812: "space shuttle",
+        817: "sports car",
+        834: "sunglasses",
+        849: "tennis ball",
+        852: "thatch",
+        859: "toaster",
+        876: "tray",
+        880: "umbrella",
+        892: "wall clock",
+        907: "wine bottle",
+        920: "traffic light",
+        934: "hot dog",
+        945: "bell pepper",
+        947: "mushroom",
+        950: "orange",
+        954: "banana",
+        963: "pizza",
+        965: "burrito",
+        967: "espresso",
+        985: "daisy",
+        988: "sunflower",
+        999: "toilet tissue"
+    ]
+
+    /// Get the label for a given class index.
+    /// Returns "class_{index}" for indices not in the condensed set.
+    static func label(for index: Int) -> String {
+        return labels[index] ?? "class_\(index)"
+    }
+
+    /// Get top-K predictions from a probability/score array.
+    static func topK(scores: [Float], k: Int = 5) -> [(index: Int, label: String, score: Float)] {
+        let indexed = scores.enumerated().map { (index: $0.offset, score: $0.element) }
+        let sorted = indexed.sorted { $0.score > $1.score }
+        let topK = sorted.prefix(k)
+        return topK.map { (index: $0.index, label: label(for: $0.index), score: $0.score) }
+    }
+}
diff --git a/sample_apps/DeepLabV3Demo/DeepLabV3Demo/Info.plist b/sample_apps/DeepLabV3Demo/DeepLabV3Demo/Info.plist
new file mode 100644
index 0000000..5eacaea
--- /dev/null
+++ b/sample_apps/DeepLabV3Demo/DeepLabV3Demo/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSCameraUsageDescription</key>
+	<string>This app needs camera access for real-time scene segmentation.</string>
+</dict>
+</plist>
diff --git a/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo.xcodeproj/project.pbxproj b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..1f31744
--- /dev/null
+++ b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo.xcodeproj/project.pbxproj
@@ -0,0 +1,342 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		A10000001 /* EfficientFormerV2DemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10000011 /* EfficientFormerV2DemoApp.swift */; };
+		A10000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10000012 /* ContentView.swift */; };
+		A10000003 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = A10000013 /* Assets.xcassets */; };
+		A10000004 /* ImageNetLabels.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10000014 /* ImageNetLabels.swift */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		A10000010 /* EfficientFormerV2Demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = EfficientFormerV2Demo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		A10000011 /* EfficientFormerV2DemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = EfficientFormerV2DemoApp.swift; sourceTree = "<group>"; };
+		A10000012 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		A10000013 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		A10000014 /* ImageNetLabels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageNetLabels.swift; sourceTree = "<group>"; };
+		A10000015 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		A10000020 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		A10000030 = {
+			isa = PBXGroup;
+			children = (
+				A10000031 /* EfficientFormerV2Demo */,
+				A10000032 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		A10000031 /* EfficientFormerV2Demo */ = {
+			isa = PBXGroup;
+			children = (
+				A10000011 /* EfficientFormerV2DemoApp.swift */,
+				A10000012 /* ContentView.swift */,
+				A10000014 /* ImageNetLabels.swift */,
+				A10000013 /* Assets.xcassets */,
+				A10000015 /* Info.plist */,
+			);
+			path = EfficientFormerV2Demo;
+			sourceTree = "<group>";
+		};
+		A10000032 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				A10000010 /* EfficientFormerV2Demo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		A10000040 /* EfficientFormerV2Demo */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = A10000060 /* Build configuration list for PBXNativeTarget "EfficientFormerV2Demo" */;
+			buildPhases = (
+				A10000041 /* Sources */,
+				A10000020 /* Frameworks */,
+				A10000042 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = EfficientFormerV2Demo;
+			productName = EfficientFormerV2Demo;
+			productReference = A10000010 /* EfficientFormerV2Demo.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		A10000050 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				BuildIndependentTargetsInParallel = 1;
+				LastSwiftUpdateCheck = 1500;
+				LastUpgradeCheck = 1500;
+				TargetAttributes = {
+					A10000040 = {
+						CreatedOnToolsVersion = 15.0;
+					};
+				};
+			};
+			buildConfigurationList = A10000070 /* Build configuration list for PBXProject "EfficientFormerV2Demo" */;
+			compatibilityVersion = "Xcode 14.0";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = A10000030;
+			productRefGroup = A10000032 /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				A10000040 /* EfficientFormerV2Demo */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+		A10000042 /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				A10000003 /* Assets.xcassets in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		A10000041 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				A10000001 /* EfficientFormerV2DemoApp.swift in Sources */,
+				A10000002 /* ContentView.swift in Sources */,
+				A10000004 /* ImageNetLabels.swift in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		A10000061 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
+				CODE_SIGN_STYLE = Automatic;
+				CURRENT_PROJECT_VERSION = 1;
+				GENERATE_INFOPLIST_FILE = YES;
+				INFOPLIST_FILE = EfficientFormerV2Demo/Info.plist;
+				INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
+				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
+				INFOPLIST_KEY_UILaunchScreen_Generation = YES;
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				MARKETING_VERSION = 1.0;
+				PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.efficientformerv2demo";
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_EMIT_LOC_STRINGS = YES;
+				SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + A10000062 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = EfficientFormerV2Demo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.efficientformerv2demo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; + A10000071 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = 
Debug; + }; + A10000072 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + A10000060 /* Build configuration list for PBXNativeTarget "EfficientFormerV2Demo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + A10000061 /* Debug */, + A10000062 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + A10000070 /* Build configuration list for PBXProject "EfficientFormerV2Demo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + A10000071 /* Debug */, + A10000072 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = A10000050 /* Project object */; +} diff --git a/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Assets.xcassets/AppIcon.appiconset/Contents.json 
b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Assets.xcassets/Contents.json b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/ContentView.swift b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/ContentView.swift new file mode 100644 index 0000000..98c4ff2 --- /dev/null +++ b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/ContentView.swift @@ -0,0 +1,306 @@ +import SwiftUI +import UIKit +import CoreML +import Vision +import PhotosUI + +// MARK: - Side-by-Side Comparison Classifier +// Uses EfficientFormerV2_S0 model (224x224 input, 1000-class ImageNet output) +// Output feature name: "var_1617" + +struct ContentView: View { + @StateObject private var classifier = SideBySideClassifier() + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 20) { + // Header + Text("Pick two photos and compare classification results side by side.") + .font(.subheadline) + .foregroundColor(.secondary) + .multilineTextAlignment(.center) + .padding(.horizontal) + + if let error = classifier.errorMessage { + ErrorBanner(message: error) + } + + // Side-by-side panels + HStack(spacing: 12) { + ImagePanel( + title: "Image A", + image: classifier.imageA, + results: classifier.resultsA, + isProcessing: classifier.isProcessingA, + selectedItem: $classifier.photoItemA + ) + + ImagePanel( + title: "Image B", + image: classifier.imageB, + results: classifier.resultsB, + isProcessing: classifier.isProcessingB, + selectedItem: $classifier.photoItemB + ) + } + .padding(.horizontal) + + // Clear button + if classifier.imageA != nil || classifier.imageB != nil { + Button(role: .destructive) { + classifier.clearAll() + } label: { + Label("Clear All", systemImage: "trash") + .frame(maxWidth: .infinity) + } + .buttonStyle(.bordered) + .padding(.horizontal) + } + } + .padding(.vertical) + } + .navigationTitle("EfficientFormerV2") + .navigationBarTitleDisplayMode(.large) + } + } +} + +// MARK: - Image Panel View +struct ImagePanel: View { + let title: String + let image: UIImage? + let results: [(label: String, score: Float)] + let isProcessing: Bool + @Binding var selectedItem: PhotosPickerItem? 
+ + var body: some View { + VStack(spacing: 8) { + Text(title) + .font(.headline) + + // Photo picker area + PhotosPicker(selection: $selectedItem, matching: .images) { + Group { + if let image = image { + Image(uiImage: image) + .resizable() + .scaledToFill() + .frame(height: 160) + .clipped() + .cornerRadius(10) + } else { + RoundedRectangle(cornerRadius: 10) + .fill(Color(.systemGray6)) + .frame(height: 160) + .overlay { + VStack(spacing: 6) { + Image(systemName: "photo.badge.plus") + .font(.title2) + Text("Select Photo") + .font(.caption) + } + .foregroundColor(.secondary) + } + } + } + } + + // Results + if isProcessing { + ProgressView("Classifying...") + .font(.caption) + } else if !results.isEmpty { + VStack(alignment: .leading, spacing: 4) { + ForEach(Array(results.prefix(5).enumerated()), id: \.offset) { _, result in + HStack { + Text(result.label) + .font(.caption2) + .lineLimit(1) + Spacer() + Text(String(format: "%.1f%%", result.score * 100)) + .font(.caption2) + .foregroundColor(.secondary) + } + // Confidence bar + GeometryReader { geo in + RoundedRectangle(cornerRadius: 2) + .fill(Color.accentColor.opacity(0.3)) + .frame(width: geo.size.width * CGFloat(result.score)) + } + .frame(height: 3) + } + } + } + } + .frame(maxWidth: .infinity) + } +} + +// MARK: - Error Banner +struct ErrorBanner: View { + let message: String + + var body: some View { + HStack { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.yellow) + Text(message) + .font(.caption) + } + .padding() + .background(Color(.systemOrange).opacity(0.1)) + .cornerRadius(8) + .padding(.horizontal) + } +} + +// MARK: - Classifier ViewModel +@MainActor +class SideBySideClassifier: ObservableObject { + @Published var imageA: UIImage? + @Published var imageB: UIImage? + @Published var resultsA: [(label: String, score: Float)] = [] + @Published var resultsB: [(label: String, score: Float)] = [] + @Published var isProcessingA = false + @Published var isProcessingB = false + @Published var errorMessage: String? + + @Published var photoItemA: PhotosPickerItem? { + didSet { Task { await loadImage(from: photoItemA, side: .a) } } + } + @Published var photoItemB: PhotosPickerItem? { + didSet { Task { await loadImage(from: photoItemB, side: .b) } } + } + + private var vnModel: VNCoreMLModel? + + enum Side { case a, b } + + init() { + loadModel() + } + + private func loadModel() { + // PLACEHOLDER: Add EfficientFormerV2_S0.mlpackage to the Xcode project. + // The compiled model class will be generated automatically by Xcode. + // Download from the converted_models directory and drag into the project navigator. + do { + guard let modelURL = Bundle.main.url(forResource: "EfficientFormerV2_S0", withExtension: "mlmodelc") else { + errorMessage = "Model not found. Add EfficientFormerV2_S0.mlpackage to the project." + return + } + let mlModel = try MLModel(contentsOf: modelURL) + vnModel = try VNCoreMLModel(for: mlModel) + } catch { + errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + + private func loadImage(from item: PhotosPickerItem?, side: Side) async { + guard let item = item, + let data = try? 
await item.loadTransferable(type: Data.self),
+              let uiImage = UIImage(data: data) else { return }
+
+        switch side {
+        case .a:
+            imageA = uiImage
+            resultsA = []
+            isProcessingA = true
+        case .b:
+            imageB = uiImage
+            resultsB = []
+            isProcessingB = true
+        }
+
+        await classify(image: uiImage, side: side)
+    }
+
+    private func classify(image: UIImage, side: Side) async {
+        guard let vnModel = vnModel else {
+            switch side {
+            case .a: isProcessingA = false
+            case .b: isProcessingB = false
+            }
+            return
+        }
+
+        guard let cgImage = image.cgImage else {
+            switch side {
+            case .a: isProcessingA = false
+            case .b: isProcessingB = false
+            }
+            return
+        }
+
+        let request = VNCoreMLRequest(model: vnModel)
+        request.imageCropAndScaleOption = .centerCrop
+
+        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+
+        do {
+            try handler.perform([request])
+
+            if let results = request.results as? [VNCoreMLFeatureValueObservation],
+               let multiArray = results.first?.featureValue.multiArrayValue {
+                // Extract scores from the "var_1617" output
+                let count = multiArray.count
+                var scores = [Float](repeating: 0, count: count)
+                for i in 0..<count {
+                    scores[i] = multiArray[i].floatValue
+                }
+
+                // Convert the raw logits to probabilities and keep the top 5
+                let probabilities = softmax(scores)
+                let top5 = ImageNetLabels.topK(scores: probabilities, k: 5)
+                    .map { (label: $0.label, score: $0.score) }
+
+                switch side {
+                case .a:
+                    resultsA = top5
+                    isProcessingA = false
+                case .b:
+                    resultsB = top5
+                    isProcessingB = false
+                }
+            }
+        } catch {
+            errorMessage = "Classification failed: \(error.localizedDescription)"
+            switch side {
+            case .a: isProcessingA = false
+            case .b: isProcessingB = false
+            }
+        }
+    }
+
+    func clearAll() {
+        imageA = nil
+        imageB = nil
+        resultsA = []
+        resultsB = []
+        photoItemA = nil
+        photoItemB = nil
+        errorMessage = nil
+    }
+
+    private func softmax(_ input: [Float]) -> [Float] {
+        let maxVal = input.max() ?? 0
+        let expValues = input.map { exp($0 - maxVal) }
+        let sumExp = expValues.reduce(0, +)
+        return expValues.map { $0 / sumExp }
+    }
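+
+    // A hedged sketch, not in the original sample: instead of assuming the
+    // auto-generated "var_1617" output name, the model's output names can be
+    // listed at runtime from its MLModelDescription. Useful when
+    // re-converting the model changes the generated identifier.
+    private func logOutputNames(of model: MLModel) {
+        for (name, feature) in model.modelDescription.outputDescriptionsByName {
+            print("Model output '\(name)' (feature type rawValue: \(feature.type.rawValue))")
+        }
+    }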
+}
+
+#Preview {
+    ContentView()
+}
diff --git a/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/EfficientFormerV2DemoApp.swift b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/EfficientFormerV2DemoApp.swift
new file mode 100644
index 0000000..c2387fb
--- /dev/null
+++ b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/EfficientFormerV2DemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct EfficientFormerV2DemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/ImageNetLabels.swift b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/ImageNetLabels.swift
new file mode 100644
index 0000000..08f202d
--- /dev/null
+++ b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/ImageNetLabels.swift
@@ -0,0 +1,95 @@
+import Foundation
+
+// MARK: - ImageNet Labels (Condensed Demo Version)
+// This file contains a small subset (about 65 of the 1000 ImageNet-1K labels) for demo purposes.
+// For the full 1000-class label list, download from:
+// https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt
+// and replace this dictionary with all 1000 entries.
+
+struct ImageNetLabels {
+    /// Full ImageNet-1K has 1000 labels. This is a condensed demo set.
+    /// Index positions correspond to the model output indices.
+    /// Replace with the full list for production use.
+    static let labels: [Int: String] = [
+        0: "tench",
+        1: "goldfish",
+        2: "great white shark",
+        7: "cock",
+        14: "indigo bunting",
+        65: "sea snake",
+        99: "goose",
+        207: "golden retriever",
+        208: "Labrador retriever",
+        231: "collie",
+        235: "German shepherd",
+        258: "Samoyed",
+        259: "Pomeranian",
+        281: "tabby cat",
+        282: "tiger cat",
+        285: "Egyptian cat",
+        291: "lion",
+        340: "zebra",
+        386: "African elephant",
+        409: "analog clock",
+        417: "balloon",
+        430: "basketball",
+        446: "bikini",
+        457: "bow tie",
+        468: "cab",
+        504: "coffee mug",
+        508: "computer keyboard",
+        531: "digital watch",
+        537: "dog sled",
+        539: "drum",
+        549: "envelope",
+        555: "fire truck",
+        569: "fountain",
+        604: "golf ball",
+        609: "grand piano",
+        620: "hamburger",
+        659: "mixing bowl",
+        671: "mountain bike",
+        673: "mouse",
+        701: "parachute",
+        717: "pickup truck",
+        737: "pot",
+        755: "redbone",
+        779: "school bus",
+        812: "space shuttle",
+        817: "sports car",
+        834: "sunglasses",
+        849: "tennis ball",
+        852: "thatch",
+        859: "toaster",
+        876: "tray",
+        880: "umbrella",
+        892: "wall clock",
+        907: "wine bottle",
+        920: "traffic light",
+        934: "hot dog",
+        945: "bell pepper",
+        947: "mushroom",
+        950: "orange",
+        954: "banana",
+        963: "pizza",
+        965: "burrito",
+        967: "espresso",
+        985: "daisy",
+        988: "sunflower",
+        999: "toilet tissue"
+    ]
+
+    /// Get the label for a given class index.
+    /// Returns "class_{index}" for indices not in the condensed set.
+    static func label(for index: Int) -> String {
+        return labels[index] ?? "class_\(index)"
+    }
+
+    /// Get top-K predictions from a probability/score array.
+    static func topK(scores: [Float], k: Int = 5) -> [(index: Int, label: String, score: Float)] {
+        let indexed = scores.enumerated().map { (index: $0.offset, score: $0.element) }
+        let sorted = indexed.sorted { $0.score > $1.score }
+        let topK = sorted.prefix(k)
+        return topK.map { (index: $0.index, label: label(for: $0.index), score: $0.score) }
+    }
+}
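+
+// A minimal sketch, assuming a bundled "imagenet_classes.txt" (one label per
+// line, in class-index order) downloaded from the URL above. The file name
+// and its presence in the bundle are assumptions, not part of this patch.
+extension ImageNetLabels {
+    static func loadFullLabels() -> [Int: String]? {
+        guard let url = Bundle.main.url(forResource: "imagenet_classes", withExtension: "txt"),
+              let text = try? String(contentsOf: url, encoding: .utf8) else { return nil }
+        var result: [Int: String] = [:]
+        // components(separatedBy:) keeps empty lines, so indices stay aligned
+        for (index, line) in text.components(separatedBy: .newlines).enumerated() {
+            let name = line.trimmingCharacters(in: .whitespaces)
+            if !name.isEmpty { result[index] = name }
+        }
+        return result
+    }
+}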
diff --git a/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Info.plist b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Info.plist
new file mode 100644
index 0000000..3faede7
--- /dev/null
+++ b/sample_apps/EfficientFormerV2Demo/EfficientFormerV2Demo/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>This app needs access to your photo library to select images for classification.</string>
+</dict>
+</plist>
diff --git a/sample_apps/FastViTDemo/FastViTDemo.xcodeproj/project.pbxproj b/sample_apps/FastViTDemo/FastViTDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..29554aa
--- /dev/null
+++ b/sample_apps/FastViTDemo/FastViTDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,344 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		D40000010000000000000001 /* FastViTDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = D40000020000000000000001 /* FastViTDemoApp.swift */; };
+		D40000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = D40000020000000000000002 /* ContentView.swift */; };
+		D40000010000000000000003 /* ImageNetLabels.swift in Sources */ = {isa = PBXBuildFile; fileRef = D40000020000000000000003 /* ImageNetLabels.swift */; };
+		D40000010000000000000004 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = D40000020000000000000004 /* Assets.xcassets */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		D40000020000000000000001 /* FastViTDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FastViTDemoApp.swift; sourceTree = "<group>"; };
+		D40000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		D40000020000000000000003 /* ImageNetLabels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageNetLabels.swift; sourceTree = "<group>"; };
+		D40000020000000000000004 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		D40000020000000000000005 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+		D40000020000000000000010 /* FastViTDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = FastViTDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		D40000030000000000000001 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		D40000040000000000000001 = {
+			isa = PBXGroup;
+			children = (
+				D40000040000000000000002 /* FastViTDemo */,
+				D40000040000000000000003 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		D40000040000000000000002 /* FastViTDemo */ = {
+			isa = PBXGroup;
+			children = (
+				D40000020000000000000001 /* FastViTDemoApp.swift */,
+				D40000020000000000000002 /* ContentView.swift */,
+				D40000020000000000000003 /* ImageNetLabels.swift */,
+				D40000020000000000000004 /* Assets.xcassets */,
+				D40000020000000000000005 /* Info.plist */,
+			);
+			path = FastViTDemo;
+			sourceTree = "<group>";
+		};
+		D40000040000000000000003 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				D40000020000000000000010 /* FastViTDemo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		D40000050000000000000001 /* FastViTDemo */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = D40000070000000000000001 /* Build configuration list for PBXNativeTarget "FastViTDemo" */;
+			buildPhases = (
+				D40000060000000000000001 /* Sources */,
+				D40000030000000000000001 /* Frameworks */,
+				D40000060000000000000002 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = FastViTDemo;
+			productName = FastViTDemo;
+			productReference = D40000020000000000000010 /* FastViTDemo.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/*
Begin PBXProject section */ + D40000080000000000000001 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + D40000050000000000000001 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = D40000070000000000000003 /* Build configuration list for PBXProject "FastViTDemo" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = D40000040000000000000001; + productRefGroup = D40000040000000000000003 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + D40000050000000000000001 /* FastViTDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + D40000060000000000000002 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + D40000010000000000000004 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + D40000060000000000000001 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + D40000010000000000000001 /* FastViTDemoApp.swift in Sources */, + D40000010000000000000002 /* ContentView.swift in Sources */, + D40000010000000000000003 /* ImageNetLabels.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + D40000090000000000000001 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + 
GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + D40000090000000000000002 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + D40000090000000000000003 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = FastViTDemo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for image classification speed benchmarks."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = 
"com.coreml-models.fastvit"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + D40000090000000000000004 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = FastViTDemo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for image classification speed benchmarks."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.fastvit"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + D40000070000000000000001 /* Build configuration list for PBXNativeTarget "FastViTDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + D40000090000000000000003 /* Debug */, + D40000090000000000000004 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + D40000070000000000000003 /* Build configuration list for PBXProject "FastViTDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + D40000090000000000000001 /* Debug */, + D40000090000000000000002 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = D40000080000000000000001 /* Project object */; +} diff --git a/sample_apps/FastViTDemo/FastViTDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/FastViTDemo/FastViTDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/sample_apps/FastViTDemo/FastViTDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/FastViTDemo/FastViTDemo/Assets.xcassets/Contents.json b/sample_apps/FastViTDemo/FastViTDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/sample_apps/FastViTDemo/FastViTDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/FastViTDemo/FastViTDemo/ContentView.swift b/sample_apps/FastViTDemo/FastViTDemo/ContentView.swift new file mode 100644 index 0000000..1788a74 --- /dev/null +++ b/sample_apps/FastViTDemo/FastViTDemo/ContentView.swift @@ -0,0 +1,454 @@ +import SwiftUI +import UIKit +import AVFoundation +import PhotosUI +import CoreML 
+import Vision + +// MARK: - Benchmark Classifier + +class FastViTClassifier: ObservableObject { + @Published var predictions: [(label: String, confidence: Float)] = [] + @Published var inferenceTimeMs: Double = 0 + @Published var averageTimeMs: Double = 0 + @Published var errorMessage: String? + @Published var isProcessing = false + + private var vnModel: VNCoreMLModel? + private var recentTimes: [Double] = [] + private let maxRecentTimes = 20 + + init() { + loadModel() + } + + private func loadModel() { + // PLACEHOLDER: Add FastViT_T8.mlpackage to the Xcode project. + // The compiled .mlmodelc will be bundled automatically. + // Download from the CoreML-Models repository and drag into Xcode. + + guard let modelURL = Bundle.main.url(forResource: "FastViT_T8", withExtension: "mlmodelc") else { + DispatchQueue.main.async { + self.errorMessage = "Model not found. Please add FastViT_T8.mlpackage to the Xcode project." + } + return + } + + do { + let config = MLModelConfiguration() + config.computeUnits = .all + let mlModel = try MLModel(contentsOf: modelURL, configuration: config) + vnModel = try VNCoreMLModel(for: mlModel) + } catch { + DispatchQueue.main.async { + self.errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + } + + func classify(image: UIImage) { + guard let vnModel = vnModel else { return } + guard let cgImage = image.cgImage else { return } + + DispatchQueue.main.async { self.isProcessing = true } + + let request = VNCoreMLRequest(model: vnModel) { [weak self] request, error in + if let results = request.results as? [VNCoreMLFeatureValueObservation], + let multiArray = results.first?.featureValue.multiArrayValue { + self?.processResults(multiArray: multiArray) + } else if let results = request.results as? [VNClassificationObservation] { + let top5 = results.prefix(5).map { (label: $0.identifier, confidence: $0.confidence) } + DispatchQueue.main.async { + self?.predictions = top5 + self?.isProcessing = false + } + } + } + request.imageCropAndScaleOption = .centerCrop + + DispatchQueue.global(qos: .userInitiated).async { + let startTime = CFAbsoluteTimeGetCurrent() + let handler = VNImageRequestHandler(cgImage: cgImage, options: [:]) + try? handler.perform([request]) + let elapsed = (CFAbsoluteTimeGetCurrent() - startTime) * 1000.0 + + DispatchQueue.main.async { [weak self] in + self?.inferenceTimeMs = elapsed + self?.recentTimes.append(elapsed) + if let count = self?.recentTimes.count, count > (self?.maxRecentTimes ?? 20) { + self?.recentTimes.removeFirst() + } + self?.averageTimeMs = (self?.recentTimes.reduce(0, +) ?? 0) / Double(self?.recentTimes.count ?? 1) + self?.isProcessing = false + } + } + } + + func classifyBuffer(sampleBuffer: CMSampleBuffer) { + guard let vnModel = vnModel else { return } + guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } + + let request = VNCoreMLRequest(model: vnModel) { [weak self] request, error in + if let results = request.results as? [VNCoreMLFeatureValueObservation], + let multiArray = results.first?.featureValue.multiArrayValue { + self?.processResults(multiArray: multiArray) + } else if let results = request.results as? 
[VNClassificationObservation] {
+                let top5 = results.prefix(5).map { (label: $0.identifier, confidence: $0.confidence) }
+                DispatchQueue.main.async {
+                    self?.predictions = top5
+                }
+            }
+        }
+        request.imageCropAndScaleOption = .centerCrop
+
+        let startTime = CFAbsoluteTimeGetCurrent()
+        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .right, options: [:])
+        try? handler.perform([request])
+        let elapsed = (CFAbsoluteTimeGetCurrent() - startTime) * 1000.0
+
+        DispatchQueue.main.async { [weak self] in
+            self?.inferenceTimeMs = elapsed
+            self?.recentTimes.append(elapsed)
+            if let count = self?.recentTimes.count, count > (self?.maxRecentTimes ?? 20) {
+                self?.recentTimes.removeFirst()
+            }
+            self?.averageTimeMs = (self?.recentTimes.reduce(0, +) ?? 0) / Double(self?.recentTimes.count ?? 1)
+        }
+    }
+
+    private func processResults(multiArray: MLMultiArray) {
+        let count = multiArray.count
+        var scores = [Float](repeating: 0, count: count)
+        for i in 0..<count {
+            scores[i] = multiArray[i].floatValue
+        }
+
+        let top5 = ImageNetLabels.topK(scores: scores, k: 5)
+            .map { (label: $0.label, confidence: $0.score) }
+
+        DispatchQueue.main.async { [weak self] in
+            self?.predictions = top5
+            self?.isProcessing = false
+        }
+    }
+
+    /// Repeatedly classify the same image to build up an average timing
+    /// (used by the "Bench x10" button).
+    func runBenchmark(image: UIImage, iterations: Int) {
+        for _ in 0..<iterations {
+            classify(image: image)
+        }
+    }
+}
+
+// MARK: - Camera Manager
+
+class CameraManager: NSObject, ObservableObject {
+    let session = AVCaptureSession()
+    var onFrame: ((CMSampleBuffer) -> Void)?
+    private var isProcessing = false
+
+    private let sessionQueue = DispatchQueue(label: "camera.session")
+
+    func configure() {
+        sessionQueue.async { [weak self] in
+            self?.setupSession()
+        }
+    }
+
+    private func setupSession() {
+        session.beginConfiguration()
+        session.sessionPreset = .medium
+
+        guard let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .back),
+              let input = try? AVCaptureDeviceInput(device: device) else {
+            session.commitConfiguration()
+            return
+        }
+
+        if session.canAddInput(input) {
+            session.addInput(input)
+        }
+
+        let output = AVCaptureVideoDataOutput()
+        output.setSampleBufferDelegate(self, queue: DispatchQueue(label: "camera.frame"))
+        output.alwaysDiscardsLateVideoFrames = true
+
+        if session.canAddOutput(output) {
+            session.addOutput(output)
+        }
+
+        session.commitConfiguration()
+        session.startRunning()
+    }
+
+    func stop() {
+        sessionQueue.async { [weak self] in
+            self?.session.stopRunning()
+        }
+    }
+}
+
+extension CameraManager: AVCaptureVideoDataOutputSampleBufferDelegate {
+    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
+        guard !isProcessing else { return }
+        isProcessing = true
+        onFrame?(sampleBuffer)
+        isProcessing = false
+    }
+}
+
+// MARK: - Camera Preview
+
+struct CameraPreview: UIViewRepresentable {
+    let session: AVCaptureSession
+
+    func makeUIView(context: Context) -> UIView {
+        let view = UIView(frame: .zero)
+        let previewLayer = AVCaptureVideoPreviewLayer(session: session)
+        previewLayer.videoGravity = .resizeAspectFill
+        view.layer.addSublayer(previewLayer)
+        context.coordinator.previewLayer = previewLayer
+        return view
+    }
+
+    func updateUIView(_ uiView: UIView, context: Context) {
+        context.coordinator.previewLayer?.frame = uiView.bounds
+    }
+
+    func makeCoordinator() -> Coordinator { Coordinator() }
+    class Coordinator { var previewLayer: AVCaptureVideoPreviewLayer? }
+}
+
+// MARK: - Content View
+
+struct ContentView: View {
+    @StateObject private var classifier = FastViTClassifier()
+    @StateObject private var camera = CameraManager()
+    @State private var selectedItem: PhotosPickerItem?
+    @State private var selectedImage: UIImage?
+ @State private var mode: InputMode = .camera + + enum InputMode: String, CaseIterable { + case camera = "Camera" + case photo = "Photo" + } + + var body: some View { + NavigationStack { + VStack(spacing: 0) { + // Mode picker + Picker("Input", selection: $mode) { + ForEach(InputMode.allCases, id: \.self) { m in + Text(m.rawValue).tag(m) + } + } + .pickerStyle(.segmented) + .padding() + + // Timing display - prominently shown + VStack(spacing: 4) { + HStack(spacing: 20) { + VStack { + Text("Last") + .font(.caption2) + .foregroundColor(.secondary) + Text(String(format: "%.1f ms", classifier.inferenceTimeMs)) + .font(.system(.title, design: .monospaced)) + .fontWeight(.bold) + .foregroundColor(.blue) + } + + Divider().frame(height: 40) + + VStack { + Text("Average") + .font(.caption2) + .foregroundColor(.secondary) + Text(String(format: "%.1f ms", classifier.averageTimeMs)) + .font(.system(.title, design: .monospaced)) + .fontWeight(.bold) + .foregroundColor(.green) + } + + Divider().frame(height: 40) + + VStack { + Text("FPS") + .font(.caption2) + .foregroundColor(.secondary) + Text(classifier.averageTimeMs > 0 ? String(format: "%.0f", 1000.0 / classifier.averageTimeMs) : "--") + .font(.system(.title, design: .monospaced)) + .fontWeight(.bold) + .foregroundColor(.orange) + } + } + } + .padding(.vertical, 8) + .frame(maxWidth: .infinity) + .background(Color(.systemGroupedBackground)) + + // Content area + ZStack { + if mode == .camera { + CameraPreview(session: camera.session) + } else { + Color(.systemGroupedBackground) + if let image = selectedImage { + Image(uiImage: image) + .resizable() + .scaledToFit() + .clipShape(RoundedRectangle(cornerRadius: 12)) + .padding() + } else { + VStack(spacing: 16) { + Image(systemName: "photo.on.rectangle.angled") + .font(.system(size: 50)) + .foregroundColor(.secondary) + Text("Select a photo to benchmark") + .foregroundColor(.secondary) + } + } + } + + if classifier.isProcessing { + ProgressView("Running benchmark...") + .padding() + .background(.ultraThinMaterial, in: RoundedRectangle(cornerRadius: 12)) + } + } + .frame(maxHeight: .infinity) + + // Error + if let error = classifier.errorMessage { + HStack { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.yellow) + Text(error) + .font(.caption) + } + .padding(8) + .background(Color.red.opacity(0.1)) + .cornerRadius(8) + .padding(.horizontal) + } + + // Predictions + if !classifier.predictions.isEmpty { + VStack(alignment: .leading, spacing: 6) { + ForEach(Array(classifier.predictions.enumerated()), id: \.offset) { index, pred in + HStack { + Text("\(index + 1). \(pred.label)") + .font(.system(.caption, design: .monospaced)) + .fontWeight(index == 0 ? 
.bold : .regular)
+                            Spacer()
+                            Text(String(format: "%.1f%%", pred.confidence * 100))
+                                .font(.system(.caption, design: .monospaced))
+                                .foregroundColor(.secondary)
+                        }
+                    }
+                }
+                .padding()
+                .background(Color(.systemBackground))
+            }
+
+            // Bottom controls
+            if mode == .photo {
+                HStack {
+                    PhotosPicker(selection: $selectedItem, matching: .images) {
+                        Label("Choose Photo", systemImage: "photo.fill")
+                            .font(.headline)
+                            .frame(maxWidth: .infinity)
+                            .padding()
+                            .background(Color.accentColor)
+                            .foregroundColor(.white)
+                            .cornerRadius(12)
+                    }
+
+                    if selectedImage != nil {
+                        Button {
+                            if let img = selectedImage {
+                                classifier.runBenchmark(image: img, iterations: 10)
+                            }
+                        } label: {
+                            Label("Bench x10", systemImage: "speedometer")
+                                .font(.headline)
+                                .frame(maxWidth: .infinity)
+                                .padding()
+                                .background(Color.orange)
+                                .foregroundColor(.white)
+                                .cornerRadius(12)
+                        }
+                    }
+                }
+                .padding()
+            }
+            }
+            .navigationTitle("FastViT-T8 Benchmark")
+            .navigationBarTitleDisplayMode(.inline)
+        }
+        .onChange(of: selectedItem) { newItem in
+            Task {
+                if let data = try? await newItem?.loadTransferable(type: Data.self),
+                   let uiImage = UIImage(data: data) {
+                    selectedImage = uiImage
+                    classifier.classify(image: uiImage)
+                }
+            }
+        }
+        .onChange(of: mode) { newMode in
+            if newMode == .camera {
+                camera.onFrame = { [weak classifier] buffer in
+                    classifier?.classifyBuffer(sampleBuffer: buffer)
+                }
+                camera.configure()
+            } else {
+                camera.stop()
+            }
+        }
+        .onAppear {
+            if mode == .camera {
+                camera.onFrame = { [weak classifier] buffer in
+                    classifier?.classifyBuffer(sampleBuffer: buffer)
+                }
+                camera.configure()
+            }
+        }
+        .onDisappear {
+            camera.stop()
+        }
+    }
+}
+
+#Preview {
+    ContentView()
+}
diff --git a/sample_apps/FastViTDemo/FastViTDemo/FastViTDemoApp.swift b/sample_apps/FastViTDemo/FastViTDemo/FastViTDemoApp.swift
new file mode 100644
index 0000000..1532bee
--- /dev/null
+++ b/sample_apps/FastViTDemo/FastViTDemo/FastViTDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct FastViTDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/sample_apps/FastViTDemo/FastViTDemo/ImageNetLabels.swift b/sample_apps/FastViTDemo/FastViTDemo/ImageNetLabels.swift
new file mode 100644
index 0000000..08f202d
--- /dev/null
+++ b/sample_apps/FastViTDemo/FastViTDemo/ImageNetLabels.swift
@@ -0,0 +1,95 @@
+import Foundation
+
+// MARK: - ImageNet Labels (Condensed Demo Version)
+// This file contains a small subset (about 65 of the 1000 ImageNet-1K labels) for demo purposes.
+// For the full 1000-class label list, download from:
+// https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt
+// and replace this dictionary with all 1000 entries.
+
+struct ImageNetLabels {
+    /// Full ImageNet-1K has 1000 labels. This is a condensed demo set.
+    /// Index positions correspond to the model output indices.
+    /// Replace with the full list for production use.
+    static let labels: [Int: String] = [
+        0: "tench",
+        1: "goldfish",
+        2: "great white shark",
+        7: "cock",
+        14: "indigo bunting",
+        65: "sea snake",
+        99: "goose",
+        207: "golden retriever",
+        208: "Labrador retriever",
+        231: "collie",
+        235: "German shepherd",
+        258: "Samoyed",
+        259: "Pomeranian",
+        281: "tabby cat",
+        282: "tiger cat",
+        285: "Egyptian cat",
+        291: "lion",
+        340: "zebra",
+        386: "African elephant",
+        409: "analog clock",
+        417: "balloon",
+        430: "basketball",
+        446: "bikini",
+        457: "bow tie",
+        468: "cab",
+        504: "coffee mug",
+        508: "computer keyboard",
+        531: "digital watch",
+        537: "dog sled",
+        539: "drum",
+        549: "envelope",
+        555: "fire truck",
+        569: "fountain",
+        604: "golf ball",
+        609: "grand piano",
+        620: "hamburger",
+        659: "mixing bowl",
+        671: "mountain bike",
+        673: "mouse",
+        701: "parachute",
+        717: "pickup truck",
+        737: "pot",
+        755: "redbone",
+        779: "school bus",
+        812: "space shuttle",
+        817: "sports car",
+        834: "sunglasses",
+        849: "tennis ball",
+        852: "thatch",
+        859: "toaster",
+        876: "tray",
+        880: "umbrella",
+        892: "wall clock",
+        907: "wine bottle",
+        920: "traffic light",
+        934: "hot dog",
+        945: "bell pepper",
+        947: "mushroom",
+        950: "orange",
+        954: "banana",
+        963: "pizza",
+        965: "burrito",
+        967: "espresso",
+        985: "daisy",
+        988: "sunflower",
+        999: "toilet tissue"
+    ]
+
+    /// Get the label for a given class index.
+    /// Returns "class_{index}" for indices not in the condensed set.
+    static func label(for index: Int) -> String {
+        return labels[index] ?? "class_\(index)"
+    }
+
+    /// Get top-K predictions from a probability/score array.
+    static func topK(scores: [Float], k: Int = 5) -> [(index: Int, label: String, score: Float)] {
+        let indexed = scores.enumerated().map { (index: $0.offset, score: $0.element) }
+        let sorted = indexed.sorted { $0.score > $1.score }
+        let topK = sorted.prefix(k)
+        return topK.map { (index: $0.index, label: label(for: $0.index), score: $0.score) }
+    }
+}
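+
+// A hedged sketch, not part of the original file: when k is much smaller
+// than the 1000-class output, a single pass that maintains a small sorted
+// buffer avoids sorting all scores. Purely illustrative.
+extension ImageNetLabels {
+    static func fastTopK(scores: [Float], k: Int = 5) -> [(index: Int, label: String, score: Float)] {
+        var best: [(index: Int, score: Float)] = [] // kept sorted, highest first
+        best.reserveCapacity(k + 1)
+        for (index, score) in scores.enumerated() {
+            if best.count < k || score > best[best.count - 1].score {
+                let insertAt = best.firstIndex { score > $0.score } ?? best.count
+                best.insert((index: index, score: score), at: insertAt)
+                if best.count > k { best.removeLast() }
+            }
+        }
+        return best.map { (index: $0.index, label: label(for: $0.index), score: $0.score) }
+    }
+}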
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		B20000001 /* GhostNetV2DemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = B20000011 /* GhostNetV2DemoApp.swift */; };
+		B20000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B20000012 /* ContentView.swift */; };
+		B20000003 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = B20000013 /* Assets.xcassets */; };
+		B20000004 /* ImageNetLabels.swift in Sources */ = {isa = PBXBuildFile; fileRef = B20000014 /* ImageNetLabels.swift */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		B20000010 /* GhostNetV2Demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = GhostNetV2Demo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		B20000011 /* GhostNetV2DemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = GhostNetV2DemoApp.swift; sourceTree = "<group>"; };
+		B20000012 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		B20000013 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		B20000014 /* ImageNetLabels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageNetLabels.swift; sourceTree = "<group>"; };
+		B20000015 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		B20000020 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		B20000030 = {
+			isa = PBXGroup;
+			children = (
+				B20000031 /* GhostNetV2Demo */,
+				B20000032 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		B20000031 /* GhostNetV2Demo */ = {
+			isa = PBXGroup;
+			children = (
+				B20000011 /* GhostNetV2DemoApp.swift */,
+				B20000012 /* ContentView.swift */,
+				B20000014 /* ImageNetLabels.swift */,
+				B20000013 /* Assets.xcassets */,
+				B20000015 /* Info.plist */,
+			);
+			path = GhostNetV2Demo;
+			sourceTree = "<group>";
+		};
+		B20000032 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				B20000010 /* GhostNetV2Demo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		B20000040 /* GhostNetV2Demo */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = B20000060 /* Build configuration list for PBXNativeTarget "GhostNetV2Demo" */;
+			buildPhases = (
+				B20000041 /* Sources */,
+				B20000020 /* Frameworks */,
+				B20000042 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = GhostNetV2Demo;
+			productName = GhostNetV2Demo;
+			productReference = B20000010 /* GhostNetV2Demo.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		B20000050 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				BuildIndependentTargetsInParallel = 1;
+				LastSwiftUpdateCheck = 1500;
+				LastUpgradeCheck = 1500;
+				TargetAttributes = {
+					B20000040 = {
+						CreatedOnToolsVersion = 15.0;
+					};
+				};
+			};
+			buildConfigurationList = B20000070 /* Build configuration list for PBXProject "GhostNetV2Demo" */;
+			compatibilityVersion = "Xcode 14.0";
+
developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = B20000030; + productRefGroup = B20000032 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + B20000040 /* GhostNetV2Demo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + B20000042 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + B20000003 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + B20000041 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + B20000001 /* GhostNetV2DemoApp.swift in Sources */, + B20000002 /* ContentView.swift in Sources */, + B20000004 /* ImageNetLabels.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + B20000061 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = GhostNetV2Demo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.ghostnetv2demo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + B20000062 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = GhostNetV2Demo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.ghostnetv2demo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; + B20000071 /* Debug */ = { + isa = 
XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + B20000072 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + 
COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + B20000060 /* Build configuration list for PBXNativeTarget "GhostNetV2Demo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + B20000061 /* Debug */, + B20000062 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + B20000070 /* Build configuration list for PBXProject "GhostNetV2Demo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + B20000071 /* Debug */, + B20000072 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = B20000050 /* Project object */; +} diff --git a/sample_apps/GhostNetV2Demo/GhostNetV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/GhostNetV2Demo/GhostNetV2Demo/Assets.xcassets/AppIcon.appiconset/Contents.json b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/GhostNetV2Demo/GhostNetV2Demo/Assets.xcassets/Contents.json b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/GhostNetV2Demo/GhostNetV2Demo/ContentView.swift b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/ContentView.swift new file mode 100644 index 0000000..768f755 --- /dev/null +++ b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/ContentView.swift @@ -0,0 +1,343 @@ +import SwiftUI +import UIKit +import CoreML +import Vision +import PhotosUI + +// MARK: - Batch Photo Classifier +// Uses GhostNetV2_100 model (224x224 input, 1000-class ImageNet output) +// Output feature name: "var_2336" + +struct ClassifiedImage: Identifiable { + let id = UUID() + let image: UIImage + var topLabel: String = "Processing..." 
+ var confidence: Float = 0 + var topResults: [(label: String, score: Float)] = [] + var isProcessing: Bool = true +} + +struct ContentView: View { + @StateObject private var classifier = BatchClassifier() + @State private var showingDetail: ClassifiedImage? + + var body: some View { + NavigationStack { + VStack(spacing: 0) { + if let error = classifier.errorMessage { + HStack { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.yellow) + Text(error) + .font(.caption) + } + .padding() + .background(Color(.systemOrange).opacity(0.1)) + } + + if classifier.images.isEmpty { + // Empty state + Spacer() + VStack(spacing: 16) { + Image(systemName: "photo.stack") + .font(.system(size: 60)) + .foregroundColor(.secondary) + Text("Select multiple photos to classify them all at once") + .font(.subheadline) + .foregroundColor(.secondary) + .multilineTextAlignment(.center) + + PhotosPicker( + selection: $classifier.selectedItems, + maxSelectionCount: 20, + matching: .images + ) { + Label("Select Photos", systemImage: "photo.on.rectangle.angled") + .font(.headline) + .padding() + .frame(maxWidth: 280) + .background(Color.accentColor) + .foregroundColor(.white) + .cornerRadius(12) + } + } + .padding() + Spacer() + } else { + // Results grid + ScrollView { + // Summary bar + HStack { + Text("\(classifier.images.count) images") + .font(.subheadline) + .foregroundColor(.secondary) + Spacer() + let done = classifier.images.filter { !$0.isProcessing }.count + if done < classifier.images.count { + ProgressView() + .scaleEffect(0.8) + Text("\(done)/\(classifier.images.count)") + .font(.caption) + .foregroundColor(.secondary) + } else { + Image(systemName: "checkmark.circle.fill") + .foregroundColor(.green) + Text("All classified") + .font(.caption) + .foregroundColor(.secondary) + } + } + .padding(.horizontal) + .padding(.top, 8) + + LazyVGrid(columns: [ + GridItem(.flexible(), spacing: 8), + GridItem(.flexible(), spacing: 8), + GridItem(.flexible(), spacing: 8) + ], spacing: 8) { + ForEach(classifier.images) { item in + ClassifiedImageCell(item: item) + .onTapGesture { + if !item.isProcessing { + showingDetail = item + } + } + } + } + .padding(.horizontal, 8) + .padding(.bottom, 16) + } + } + } + .navigationTitle("GhostNetV2 Batch") + .navigationBarTitleDisplayMode(.large) + .toolbar { + if !classifier.images.isEmpty { + ToolbarItem(placement: .navigationBarLeading) { + Button("Clear") { + classifier.clearAll() + } + } + ToolbarItem(placement: .navigationBarTrailing) { + PhotosPicker( + selection: $classifier.selectedItems, + maxSelectionCount: 20, + matching: .images + ) { + Image(systemName: "plus.circle") + } + } + } + } + .sheet(item: $showingDetail) { item in + DetailSheet(item: item) + } + } + } +} + +// MARK: - Grid Cell +struct ClassifiedImageCell: View { + let item: ClassifiedImage + + var body: some View { + VStack(spacing: 4) { + Image(uiImage: item.image) + .resizable() + .scaledToFill() + .frame(height: 100) + .clipped() + .cornerRadius(8) + + if item.isProcessing { + ProgressView() + .scaleEffect(0.6) + .frame(height: 30) + } else { + Text(item.topLabel) + .font(.caption2) + .fontWeight(.medium) + .lineLimit(1) + Text(String(format: "%.1f%%", item.confidence * 100)) + .font(.caption2) + .foregroundColor(.secondary) + } + } + } +} + +// MARK: - Detail Sheet +struct DetailSheet: View { + let item: ClassifiedImage + @Environment(\.dismiss) private var dismiss + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 16) { + Image(uiImage: item.image) + 
.resizable()
+                        .scaledToFit()
+                        .frame(maxHeight: 300)
+                        .cornerRadius(12)
+
+                    VStack(alignment: .leading, spacing: 8) {
+                        Text("Top Predictions")
+                            .font(.headline)
+
+                        ForEach(Array(item.topResults.enumerated()), id: \.offset) { index, result in
+                            HStack {
+                                Text("\(index + 1).")
+                                    .font(.caption)
+                                    .foregroundColor(.secondary)
+                                    .frame(width: 20)
+                                Text(result.label)
+                                    .font(.subheadline)
+                                Spacer()
+                                Text(String(format: "%.2f%%", result.score * 100))
+                                    .font(.subheadline)
+                                    .foregroundColor(.secondary)
+                            }
+                            ProgressView(value: result.score)
+                                .tint(.accentColor)
+                        }
+                    }
+                    .padding()
+                }
+                .padding()
+            }
+            .navigationTitle("Classification Detail")
+            .navigationBarTitleDisplayMode(.inline)
+            .toolbar {
+                ToolbarItem(placement: .navigationBarTrailing) {
+                    Button("Done") { dismiss() }
+                }
+            }
+        }
+    }
+}
+
+// MARK: - Batch Classifier ViewModel
+@MainActor
+class BatchClassifier: ObservableObject {
+    @Published var images: [ClassifiedImage] = []
+    @Published var errorMessage: String?
+
+    @Published var selectedItems: [PhotosPickerItem] = [] {
+        didSet { Task { await loadImages() } }
+    }
+
+    private var vnModel: VNCoreMLModel?
+
+    init() {
+        loadModel()
+    }
+
+    private func loadModel() {
+        // PLACEHOLDER: Add GhostNetV2_100.mlpackage to the Xcode project.
+        // The compiled model class will be generated automatically by Xcode.
+        // Download from the converted_models directory and drag into the project navigator.
+        do {
+            guard let modelURL = Bundle.main.url(forResource: "GhostNetV2_100", withExtension: "mlmodelc") else {
+                errorMessage = "Model not found. Add GhostNetV2_100.mlpackage to the project."
+                return
+            }
+            let mlModel = try MLModel(contentsOf: modelURL)
+            vnModel = try VNCoreMLModel(for: mlModel)
+        } catch {
+            errorMessage = "Failed to load model: \(error.localizedDescription)"
+        }
+    }
+
+    private func loadImages() async {
+        var newImages: [ClassifiedImage] = []
+
+        for item in selectedItems {
+            if let data = try? await item.loadTransferable(type: Data.self),
+               let uiImage = UIImage(data: data) {
+                newImages.append(ClassifiedImage(image: uiImage))
+            }
+        }
+
+        images = newImages
+
+        // Classify all images concurrently
+        for index in images.indices {
+            Task {
+                await classifyImage(at: index)
+            }
+        }
+    }
+
+    private func classifyImage(at index: Int) async {
+        guard index < images.count else { return }
+        guard let vnModel = vnModel else {
+            if index < images.count {
+                images[index].isProcessing = false
+                images[index].topLabel = "No model"
+            }
+            return
+        }
+
+        let image = images[index].image
+        guard let cgImage = image.cgImage else {
+            images[index].isProcessing = false
+            images[index].topLabel = "Invalid image"
+            return
+        }

+        let request = VNCoreMLRequest(model: vnModel)
+        request.imageCropAndScaleOption = .centerCrop
+
+        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+
+        do {
+            try handler.perform([request])
+
+            if let results = request.results as? [VNCoreMLFeatureValueObservation],
+               let multiArray = results.first?.featureValue.multiArrayValue {
+                // Copy the raw logits out of the MLMultiArray, then softmax to probabilities.
+                let count = multiArray.count
+                var scores = [Float](repeating: 0, count: count)
+                for i in 0..<count {
+                    scores[i] = multiArray[i].floatValue
+                }
+                let probs = softmax(scores)
+                let top = ImageNetLabels.topK(scores: probs, k: 5)
+                images[index].topResults = top.map { (label: $0.label, score: $0.score) }
+                images[index].topLabel = top.first?.label ?? "Unknown"
+                images[index].confidence = top.first?.score ?? 0
+            }
+        } catch {
+            images[index].topLabel = "Classification failed"
+        }
+
+        images[index].isProcessing = false
+    }
+
+    func clearAll() {
+        images = []
+        selectedItems = []
+    }
+
+    /// Numerically stable softmax: shift by the max logit before exponentiating.
+    private func softmax(_ input: [Float]) -> [Float] {
+        let maxVal = input.max() ?? 0
+        let expValues = input.map { exp($0 - maxVal) }
+        let sumExp = expValues.reduce(0, +)
+        return expValues.map { $0 / sumExp }
+    }
+}
+
+#Preview {
+    ContentView()
+}
diff --git a/sample_apps/GhostNetV2Demo/GhostNetV2Demo/GhostNetV2DemoApp.swift b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/GhostNetV2DemoApp.swift
new file mode 100644
index 0000000..25fcbc4
--- /dev/null
+++ b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/GhostNetV2DemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct GhostNetV2DemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/sample_apps/GhostNetV2Demo/GhostNetV2Demo/ImageNetLabels.swift b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/ImageNetLabels.swift
new file mode 100644
index 0000000..08f202d
--- /dev/null
+++ b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/ImageNetLabels.swift
@@ -0,0 +1,95 @@
+import Foundation
+
+// MARK: - ImageNet Labels (Condensed Demo Version)
+// This file contains a subset of 66 common ImageNet-1K labels for demo purposes.
+// For the full 1000-class label list, download from:
+// https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt
+// and replace this dictionary with all 1000 entries.
+
+struct ImageNetLabels {
+    /// Full ImageNet-1K has 1000 labels. This is a condensed demo set.
+    /// Index positions correspond to the model output indices.
+    /// Replace with the full list for production use.
+    static let labels: [Int: String] = [
+        0: "tench",
+        1: "goldfish",
+        2: "great white shark",
+        7: "cock",
+        14: "indigo bunting",
+        65: "sea snake",
+        99: "goose",
+        207: "golden retriever",
+        208: "Labrador retriever",
+        231: "collie",
+        235: "German shepherd",
+        258: "Samoyed",
+        259: "Pomeranian",
+        281: "tabby cat",
+        282: "tiger cat",
+        285: "Egyptian cat",
+        291: "lion",
+        340: "zebra",
+        386: "African elephant",
+        409: "analog clock",
+        417: "balloon",
+        430: "basketball",
+        446: "bikini",
+        457: "bow tie",
+        468: "cab",
+        504: "coffee mug",
+        508: "computer keyboard",
+        531: "digital watch",
+        537: "dog sled",
+        539: "drum",
+        549: "envelope",
+        555: "fire truck",
+        569: "fountain",
+        604: "golf ball",
+        609: "grand piano",
+        620: "hamburger",
+        659: "mixing bowl",
+        671: "mountain bike",
+        673: "mouse",
+        701: "parachute",
+        717: "pickup truck",
+        737: "pot",
+        755: "redbone",
+        779: "school bus",
+        812: "space shuttle",
+        817: "sports car",
+        834: "sunglasses",
+        849: "tennis ball",
+        852: "thatch",
+        859: "toaster",
+        876: "tray",
+        880: "umbrella",
+        892: "wall clock",
+        907: "wine bottle",
+        920: "traffic light",
+        934: "hot dog",
+        945: "bell pepper",
+        947: "mushroom",
+        950: "orange",
+        954: "banana",
+        963: "pizza",
+        965: "burrito",
+        967: "espresso",
+        985: "daisy",
+        988: "sunflower",
+        999: "toilet tissue"
+    ]
+
+    /// Get the label for a given class index.
+    /// Returns "class_{index}" for indices not in the condensed set.
+    static func label(for index: Int) -> String {
+        return labels[index] ?? "class_\(index)"
+    }
+
+    /// Get top-K predictions from a probability/score array.
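+    /// For example, with the condensed label set above, topK(scores: [0.1, 0.7, 0.2], k: 2) returns
+    /// [(index: 1, label: "goldfish", score: 0.7), (index: 2, label: "great white shark", score: 0.2)].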
+    static func topK(scores: [Float], k: Int = 5) -> [(index: Int, label: String, score: Float)] {
+        let indexed = scores.enumerated().map { (index: $0.offset, score: $0.element) }
+        let sorted = indexed.sorted { $0.score > $1.score }
+        let topK = sorted.prefix(k)
+        return topK.map { (index: $0.index, label: label(for: $0.index), score: $0.score) }
+    }
+}
diff --git a/sample_apps/GhostNetV2Demo/GhostNetV2Demo/Info.plist b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/Info.plist
new file mode 100644
index 0000000..9aad836
--- /dev/null
+++ b/sample_apps/GhostNetV2Demo/GhostNetV2Demo/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>This app needs access to your photo library to select images for batch classification.</string>
+</dict>
+</plist>
diff --git a/sample_apps/LRASPPDemo/LRASPPDemo.xcodeproj/project.pbxproj b/sample_apps/LRASPPDemo/LRASPPDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..84a9dd4
--- /dev/null
+++ b/sample_apps/LRASPPDemo/LRASPPDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,342 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		C30000001 /* LRASPPDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = C30000011 /* LRASPPDemoApp.swift */; };
+		C30000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = C30000012 /* ContentView.swift */; };
+		C30000003 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = C30000013 /* Assets.xcassets */; };
+		C30000004 /* VOCLabels.swift in Sources */ = {isa = PBXBuildFile; fileRef = C30000014 /* VOCLabels.swift */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		C30000010 /* LRASPPDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LRASPPDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		C30000011 /* LRASPPDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LRASPPDemoApp.swift; sourceTree = "<group>"; };
+		C30000012 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		C30000013 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		C30000014 /* VOCLabels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = VOCLabels.swift; sourceTree = "<group>"; };
+		C30000015 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		C30000020 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		C30000030 = {
+			isa = PBXGroup;
+			children = (
+				C30000031 /* LRASPPDemo */,
+				C30000032 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		C30000031 /* LRASPPDemo */ = {
+			isa = PBXGroup;
+			children = (
+				C30000011 /* LRASPPDemoApp.swift */,
+				C30000012 /* ContentView.swift */,
+				C30000014 /* VOCLabels.swift */,
+				C30000013 /* Assets.xcassets */,
+				C30000015 /* Info.plist */,
+			);
+			path = LRASPPDemo;
+			sourceTree = "<group>";
+		};
+		C30000032 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				C30000010 /* LRASPPDemo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+
C30000040 /* LRASPPDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = C30000060 /* Build configuration list for PBXNativeTarget "LRASPPDemo" */; + buildPhases = ( + C30000041 /* Sources */, + C30000020 /* Frameworks */, + C30000042 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = LRASPPDemo; + productName = LRASPPDemo; + productReference = C30000010 /* LRASPPDemo.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + C30000050 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + C30000040 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = C30000070 /* Build configuration list for PBXProject "LRASPPDemo" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = C30000030; + productRefGroup = C30000032 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + C30000040 /* LRASPPDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + C30000042 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + C30000003 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + C30000041 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + C30000001 /* LRASPPDemoApp.swift in Sources */, + C30000002 /* ContentView.swift in Sources */, + C30000004 /* VOCLabels.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + C30000061 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = LRASPPDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.lrasppdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + C30000062 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = LRASPPDemo/Info.plist; + 
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.lrasppdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; + C30000071 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + C30000072 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK 
= YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + C30000060 /* Build configuration list for PBXNativeTarget "LRASPPDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + C30000061 /* Debug */, + C30000062 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + C30000070 /* Build configuration list for PBXProject "LRASPPDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + C30000071 /* Debug */, + C30000072 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = C30000050 /* Project object */; +} diff --git a/sample_apps/LRASPPDemo/LRASPPDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/LRASPPDemo/LRASPPDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/sample_apps/LRASPPDemo/LRASPPDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/LRASPPDemo/LRASPPDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/sample_apps/LRASPPDemo/LRASPPDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/sample_apps/LRASPPDemo/LRASPPDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/LRASPPDemo/LRASPPDemo/Assets.xcassets/Contents.json b/sample_apps/LRASPPDemo/LRASPPDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ 
b/sample_apps/LRASPPDemo/LRASPPDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/LRASPPDemo/LRASPPDemo/ContentView.swift b/sample_apps/LRASPPDemo/LRASPPDemo/ContentView.swift new file mode 100644 index 0000000..67ffd0c --- /dev/null +++ b/sample_apps/LRASPPDemo/LRASPPDemo/ContentView.swift @@ -0,0 +1,317 @@ +import SwiftUI +import UIKit +import CoreML +import Vision +import PhotosUI + +// MARK: - Lightweight Scene Segmentation +// Uses LRASPP_MobileNetV3 model (512x512 input, 1x21x512x512 segmentation map output) +// Output feature name: "var_972" +// 21 Pascal VOC classes + +struct ContentView: View { + @StateObject private var segmenter = SegmentationViewModel() + + var body: some View { + NavigationStack { + VStack(spacing: 0) { + if let error = segmenter.errorMessage { + HStack { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.yellow) + Text(error) + .font(.caption) + } + .padding() + .background(Color(.systemOrange).opacity(0.1)) + } + + if let originalImage = segmenter.originalImage { + // Image display area + ZStack { + Image(uiImage: originalImage) + .resizable() + .scaledToFit() + + if let overlayImage = segmenter.overlayImage, segmenter.showOverlay { + Image(uiImage: overlayImage) + .resizable() + .scaledToFit() + .opacity(segmenter.overlayOpacity) + } + } + .frame(maxWidth: .infinity) + .background(Color.black) + + // Controls + VStack(spacing: 12) { + // Overlay toggle + Toggle(isOn: $segmenter.showOverlay) { + Label("Segmentation Overlay", systemImage: "square.stack.3d.up") + } + + if segmenter.showOverlay { + // Opacity slider + HStack { + Text("Opacity") + .font(.caption) + Slider(value: $segmenter.overlayOpacity, in: 0.1...1.0) + Text(String(format: "%.0f%%", segmenter.overlayOpacity * 100)) + .font(.caption) + .frame(width: 40) + } + } + + // Detected classes + if !segmenter.detectedClasses.isEmpty { + VStack(alignment: .leading, spacing: 6) { + Text("Detected Classes") + .font(.headline) + + LazyVGrid(columns: [ + GridItem(.flexible()), + GridItem(.flexible()), + GridItem(.flexible()) + ], spacing: 6) { + ForEach(segmenter.detectedClasses, id: \.index) { cls in + HStack(spacing: 4) { + Circle() + .fill(VOCLabels.color(for: cls.index)) + .frame(width: 10, height: 10) + Text(cls.name) + .font(.caption2) + .lineLimit(1) + Spacer() + Text(String(format: "%.0f%%", cls.percentage)) + .font(.caption2) + .foregroundColor(.secondary) + } + } + } + } + } + + if segmenter.isProcessing { + ProgressView("Segmenting image...") + } + } + .padding() + + Spacer() + } else { + // Empty state + Spacer() + VStack(spacing: 16) { + Image(systemName: "square.stack.3d.down.right") + .font(.system(size: 60)) + .foregroundColor(.secondary) + Text("Select a photo to perform\nscene segmentation") + .font(.subheadline) + .foregroundColor(.secondary) + .multilineTextAlignment(.center) + + PhotosPicker( + selection: $segmenter.selectedItem, + matching: .images + ) { + Label("Select Photo", systemImage: "photo") + .font(.headline) + .padding() + .frame(maxWidth: 280) + .background(Color.accentColor) + .foregroundColor(.white) + .cornerRadius(12) + } + } + Spacer() + } + } + .navigationTitle("LRASPP Segmentation") + .navigationBarTitleDisplayMode(.inline) + .toolbar { + if segmenter.originalImage != nil { + ToolbarItem(placement: .navigationBarTrailing) { + PhotosPicker( + selection: $segmenter.selectedItem, + matching: .images + ) { + Image(systemName: "photo.badge.plus") + } + } + } + } + } + 
}
+}
+
+// MARK: - Detected Class Info
+struct DetectedClass {
+    let index: Int
+    let name: String
+    let percentage: Double // percentage of pixels
+}
+
+// MARK: - Segmentation ViewModel
+@MainActor
+class SegmentationViewModel: ObservableObject {
+    @Published var originalImage: UIImage?
+    @Published var overlayImage: UIImage?
+    @Published var showOverlay = true
+    @Published var overlayOpacity: Double = 0.5
+    @Published var isProcessing = false
+    @Published var errorMessage: String?
+    @Published var detectedClasses: [DetectedClass] = []
+
+    @Published var selectedItem: PhotosPickerItem? {
+        didSet { Task { await loadAndSegment() } }
+    }
+
+    private var vnModel: VNCoreMLModel?
+
+    init() {
+        loadModel()
+    }
+
+    private func loadModel() {
+        // PLACEHOLDER: Add LRASPP_MobileNetV3.mlpackage to the Xcode project.
+        // The compiled model class will be generated automatically by Xcode.
+        // Download from the converted_models directory and drag into the project navigator.
+        do {
+            guard let modelURL = Bundle.main.url(forResource: "LRASPP_MobileNetV3", withExtension: "mlmodelc") else {
+                errorMessage = "Model not found. Add LRASPP_MobileNetV3.mlpackage to the project."
+                return
+            }
+            let mlModel = try MLModel(contentsOf: modelURL)
+            vnModel = try VNCoreMLModel(for: mlModel)
+        } catch {
+            errorMessage = "Failed to load model: \(error.localizedDescription)"
+        }
+    }
+
+    private func loadAndSegment() async {
+        guard let item = selectedItem,
+              let data = try? await item.loadTransferable(type: Data.self),
+              let uiImage = UIImage(data: data) else { return }
+
+        originalImage = uiImage
+        overlayImage = nil
+        detectedClasses = []
+        isProcessing = true
+
+        await performSegmentation(on: uiImage)
+    }
+
+    private func performSegmentation(on image: UIImage) async {
+        guard let vnModel = vnModel else {
+            isProcessing = false
+            return
+        }
+
+        guard let cgImage = image.cgImage else {
+            isProcessing = false
+            return
+        }
+
+        let request = VNCoreMLRequest(model: vnModel)
+        request.imageCropAndScaleOption = .scaleFill
+
+        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+
+        do {
+            try handler.perform([request])
+
+            if let results = request.results as? [VNCoreMLFeatureValueObservation],
+               let multiArray = results.first?.featureValue.multiArrayValue {
+                // Output shape: 1 x 21 x 512 x 512
+                processSegmentationOutput(multiArray: multiArray, originalSize: image.size)
+            }
+        } catch {
+            errorMessage = "Segmentation failed: \(error.localizedDescription)"
+        }
+
+        isProcessing = false
+    }
+
+    private func processSegmentationOutput(multiArray: MLMultiArray, originalSize: CGSize) {
+        let numClasses = 21
+        let height = 512
+        let width = 512
+        let totalPixels = height * width
+
+        // Find argmax class for each pixel
+        var classMap = [Int](repeating: 0, count: totalPixels)
+        var classCounts = [Int](repeating: 0, count: numClasses)
+
+        for y in 0..<height {
+            for x in 0..<width {
+                var maxClass = 0
+                var maxVal = -Float.greatestFiniteMagnitude
+                for c in 0..<numClasses {
+                    let val = multiArray[[0, c, y, x] as [NSNumber]].floatValue
+                    if val > maxVal {
+                        maxVal = val
+                        maxClass = c
+                    }
+                }
+
+                let pixelIndex = y * width + x
+                classMap[pixelIndex] = maxClass
+                classCounts[maxClass] += 1
+            }
+        }
+
+        // Build overlay image
+        var pixelData = [UInt8](repeating: 0, count: totalPixels * 4) // RGBA
+
+        for i in 0..<totalPixels {
+            let (r, g, b) = VOCLabels.rgbColor(for: classMap[i])
+            pixelData[i * 4] = r
+            pixelData[i * 4 + 1] = g
+            pixelData[i * 4 + 2] = b
+            pixelData[i * 4 + 3] = classMap[i] == 0 ? 0 : 255 // keep background transparent
+        }
+
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.last.rawValue)
+        if let provider = CGDataProvider(data: Data(pixelData) as CFData),
+           let cgImage = CGImage(width: width, height: height,
+                                 bitsPerComponent: 8, bitsPerPixel: 32, bytesPerRow: width * 4,
+                                 space: colorSpace, bitmapInfo: bitmapInfo,
+                                 provider: provider, decode: nil,
+                                 shouldInterpolate: false, intent: .defaultIntent) {
+            overlayImage = UIImage(cgImage: cgImage)
+        }
+
+        // Report classes that cover a meaningful share of the image
+        var detected: [DetectedClass] = []
+        for c in 0..<numClasses {
+            let pct = Double(classCounts[c]) / Double(totalPixels) * 100
+            if pct > 0.5 { // Only show classes with > 0.5% coverage
+                detected.append(DetectedClass(
+                    index: c,
+                    name: VOCLabels.name(for: c),
+                    percentage: pct
+                ))
+            }
+        }
+        detectedClasses = detected.sorted { $0.percentage > $1.percentage }
+    }
+}
+
+#Preview {
+    ContentView()
+}
diff --git a/sample_apps/LRASPPDemo/LRASPPDemo/Info.plist b/sample_apps/LRASPPDemo/LRASPPDemo/Info.plist
new file mode 100644
index 0000000..bc69468
--- /dev/null
+++ b/sample_apps/LRASPPDemo/LRASPPDemo/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>This app needs access to your photo library to select images for segmentation.</string>
+</dict>
+</plist>
diff --git a/sample_apps/LRASPPDemo/LRASPPDemo/LRASPPDemoApp.swift b/sample_apps/LRASPPDemo/LRASPPDemo/LRASPPDemoApp.swift
new file mode 100644
index 0000000..75b79c5
--- /dev/null
+++ b/sample_apps/LRASPPDemo/LRASPPDemo/LRASPPDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct LRASPPDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/sample_apps/LRASPPDemo/LRASPPDemo/VOCLabels.swift b/sample_apps/LRASPPDemo/LRASPPDemo/VOCLabels.swift
new file mode 100644
index 0000000..79f33aa
--- /dev/null
+++ b/sample_apps/LRASPPDemo/LRASPPDemo/VOCLabels.swift
@@ -0,0 +1,56 @@
+import Foundation
+import SwiftUI
+
+// MARK: - Pascal VOC Segmentation Labels
+// 21 classes used by LRASPP_MobileNetV3 semantic segmentation model
+
+struct VOCLabels {
+    struct SegmentationClass {
+        let index: Int
+        let name: String
+        let color: Color
+        let rgbColor: (UInt8, UInt8, UInt8)
+    }
+
+    static let classes: [SegmentationClass] = [
+        SegmentationClass(index: 0, name: "Background", color: .black, rgbColor: (0, 0, 0)),
+        SegmentationClass(index: 1, name: "Aeroplane", color: .red, rgbColor: (128, 0, 0)),
+        SegmentationClass(index: 2, name: "Bicycle", color: .green, rgbColor: (0, 128, 0)),
+        SegmentationClass(index: 3, name: "Bird", color: .blue, rgbColor: (128, 128, 0)),
+        SegmentationClass(index: 4, name: "Boat", color: .yellow, rgbColor: (0, 0, 128)),
+        SegmentationClass(index: 5, name: "Bottle", color: .purple, rgbColor: (128, 0, 128)),
+        SegmentationClass(index: 6, name: "Bus", color: .orange, rgbColor: (0, 128, 128)),
+        SegmentationClass(index: 7, name: "Car", color: .cyan, rgbColor: (128, 128, 128)),
+        SegmentationClass(index: 8, name: "Cat", color: .mint, rgbColor: (64, 0, 0)),
+        SegmentationClass(index: 9, name: "Chair", color: .teal, rgbColor: (192, 0, 0)),
+        SegmentationClass(index: 10, name: "Cow", color: .indigo, rgbColor: (64, 128, 0)),
+        SegmentationClass(index: 11, name: "Dining Table", color: .brown, rgbColor: (192, 128, 0)),
+        SegmentationClass(index: 12, name: "Dog", color: Color(red: 1.0, green: 0.4, blue: 0.4), rgbColor: (64, 0, 128)),
+        SegmentationClass(index: 13, name: "Horse", color: Color(red: 0.4, green: 1.0, blue: 0.4), rgbColor: (192, 0, 128)),
+        SegmentationClass(index: 14, name: "Motorbike", color: Color(red: 0.4, green: 0.4, blue: 1.0), rgbColor: (64, 128, 128)),
+        SegmentationClass(index: 15, name: "Person", color: Color(red: 1.0, green: 0.0, blue: 0.5), rgbColor: (192, 128, 128)),
+        SegmentationClass(index: 16, name: "Potted Plant", color: Color(red: 0.5, green: 1.0, blue: 0.0), rgbColor: (0, 64, 0)),
+        SegmentationClass(index: 17, name: "Sheep", color: Color(red: 0.0, green: 0.5, blue: 1.0), rgbColor: (128, 64, 0)),
+        SegmentationClass(index: 18, name: "Sofa", color: Color(red: 0.8, green: 0.8, blue: 0.0), rgbColor: (0, 192, 0)),
+        SegmentationClass(index: 19, name: "Train", color: Color(red: 0.0, green: 0.8, blue: 0.8), rgbColor: (128, 192, 0)),
+        SegmentationClass(index: 20, name: "TV/Monitor", color: Color(red: 0.8, green: 0.0, blue: 0.8), rgbColor: (0, 64, 128))
+    ]
+
+    /// Get name for a class index
+    static func name(for index: Int) -> String {
+        guard index >= 0 && index < classes.count else { return "Unknown" }
+        return classes[index].name
+    }
+
+    /// Get color for a class index
+    static func color(for index: Int) -> Color {
+        guard index >= 0 && index < classes.count else { return .gray }
+        return classes[index].color
+    }
+
+    /// Get RGB color tuple for a class index
+    static func rgbColor(for index: Int) -> (UInt8, UInt8, UInt8) {
+        guard index >= 0 && index < classes.count else { return (128, 128, 128) }
+        return classes[index].rgbColor
+    }
+}
diff --git a/sample_apps/LeViTDemo/LeViTDemo.xcodeproj/project.pbxproj b/sample_apps/LeViTDemo/LeViTDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..b6f1e7e
--- /dev/null
+++ b/sample_apps/LeViTDemo/LeViTDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,344 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		E50000001 /* LeViTDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = E50000011 /* LeViTDemoApp.swift */; };
+		E50000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E50000012 /* ContentView.swift */; };
+		E50000003 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = E50000013 /* Assets.xcassets */; };
+		E50000004 /* ImageNetLabels.swift in Sources */ = {isa = PBXBuildFile; fileRef = E50000014 /* ImageNetLabels.swift */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		E50000010 /* LeViTDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LeViTDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		E50000011 /* LeViTDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LeViTDemoApp.swift; sourceTree = "<group>"; };
+		E50000012 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		E50000013 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		E50000014 /* ImageNetLabels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageNetLabels.swift; sourceTree = "<group>"; };
+		E50000015 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		E50000020 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		E50000030 = {
+			isa = PBXGroup;
+			children = (
+				E50000031 /* LeViTDemo */,
+				E50000032 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		E50000031 /* LeViTDemo */ = {
+			isa = PBXGroup;
+			children = (
+				E50000011 /* LeViTDemoApp.swift */,
+				E50000012 /* ContentView.swift */,
+				E50000014 /* ImageNetLabels.swift */,
+				E50000013 /* Assets.xcassets */,
+				E50000015 /* Info.plist */,
+			);
+			path = LeViTDemo;
+			sourceTree = "<group>";
+		};
+		E50000032 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				E50000010 /* LeViTDemo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget
section */ + E50000040 /* LeViTDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = E50000060 /* Build configuration list for PBXNativeTarget "LeViTDemo" */; + buildPhases = ( + E50000041 /* Sources */, + E50000020 /* Frameworks */, + E50000042 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = LeViTDemo; + productName = LeViTDemo; + productReference = E50000010 /* LeViTDemo.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + E50000050 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + E50000040 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = E50000070 /* Build configuration list for PBXProject "LeViTDemo" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = E50000030; + productRefGroup = E50000032 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + E50000040 /* LeViTDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + E50000042 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + E50000003 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + E50000041 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + E50000001 /* LeViTDemoApp.swift in Sources */, + E50000002 /* ContentView.swift in Sources */, + E50000004 /* ImageNetLabels.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + E50000061 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = LeViTDemo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for continuous real-time image classification."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.levitdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + E50000062 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + 
CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = LeViTDemo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for continuous real-time image classification."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.levitdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; + E50000071 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + E50000072 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = 
YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + E50000060 /* Build configuration list for PBXNativeTarget "LeViTDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + E50000061 /* Debug */, + E50000062 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + E50000070 /* Build configuration list for PBXProject "LeViTDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + E50000071 /* Debug */, + E50000072 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = E50000050 /* Project object */; +} diff --git a/sample_apps/LeViTDemo/LeViTDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/LeViTDemo/LeViTDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/sample_apps/LeViTDemo/LeViTDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/LeViTDemo/LeViTDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/sample_apps/LeViTDemo/LeViTDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/sample_apps/LeViTDemo/LeViTDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } 
+} diff --git a/sample_apps/LeViTDemo/LeViTDemo/Assets.xcassets/Contents.json b/sample_apps/LeViTDemo/LeViTDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/sample_apps/LeViTDemo/LeViTDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/LeViTDemo/LeViTDemo/ContentView.swift b/sample_apps/LeViTDemo/LeViTDemo/ContentView.swift new file mode 100644 index 0000000..93ad1f8 --- /dev/null +++ b/sample_apps/LeViTDemo/LeViTDemo/ContentView.swift @@ -0,0 +1,408 @@ +import SwiftUI +import UIKit +import CoreML +import Vision +import AVFoundation + +// MARK: - Continuous Camera Classifier with History +// Uses LeViT_128S model (224x224 input, 1000-class ImageNet output) +// Output feature name: "var_1140" + +struct ClassificationEntry: Identifiable { + let id = UUID() + let label: String + let confidence: Float + let timestamp: Date +} + +struct ContentView: View { + @StateObject private var classifier = CameraClassifier() + + var body: some View { + NavigationStack { + VStack(spacing: 0) { + if let error = classifier.errorMessage { + HStack { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.yellow) + Text(error) + .font(.caption) + } + .padding(8) + .background(Color(.systemOrange).opacity(0.1)) + } + + // Camera preview area + ZStack(alignment: .bottom) { + CameraPreviewView(session: classifier.captureSession) + .frame(height: 320) + .clipped() + .background(Color.black) + + // Current prediction overlay + if let current = classifier.currentPrediction { + HStack { + VStack(alignment: .leading, spacing: 2) { + Text(current.label) + .font(.title3) + .fontWeight(.bold) + .foregroundColor(.white) + Text(String(format: "%.1f%% confidence", current.confidence * 100)) + .font(.caption) + .foregroundColor(.white.opacity(0.8)) + } + Spacer() + // FPS indicator + Text(String(format: "%.1f fps", classifier.fps)) + .font(.caption2) + .foregroundColor(.white.opacity(0.6)) + .padding(4) + .background(Color.black.opacity(0.3)) + .cornerRadius(4) + } + .padding() + .background( + LinearGradient( + colors: [.clear, .black.opacity(0.7)], + startPoint: .top, + endPoint: .bottom + ) + ) + } + } + + // Controls bar + HStack { + Button { + classifier.toggleCamera() + } label: { + Label( + classifier.isRunning ? "Pause" : "Resume", + systemImage: classifier.isRunning ? 
"pause.circle.fill" : "play.circle.fill" + ) + .font(.subheadline) + } + .buttonStyle(.bordered) + + Spacer() + + Text("\(classifier.history.count) classifications") + .font(.caption) + .foregroundColor(.secondary) + + Spacer() + + Button { + classifier.clearHistory() + } label: { + Label("Clear", systemImage: "trash") + .font(.subheadline) + } + .buttonStyle(.bordered) + .tint(.red) + } + .padding(.horizontal) + .padding(.vertical, 8) + + // Classification history log + List { + ForEach(classifier.history) { entry in + HStack { + VStack(alignment: .leading, spacing: 2) { + Text(entry.label) + .font(.subheadline) + .fontWeight(.medium) + Text(entry.timestamp, style: .time) + .font(.caption2) + .foregroundColor(.secondary) + } + + Spacer() + + // Confidence bar + VStack(alignment: .trailing, spacing: 2) { + Text(String(format: "%.1f%%", entry.confidence * 100)) + .font(.caption) + .foregroundColor(.secondary) + ProgressView(value: entry.confidence) + .frame(width: 60) + .tint(confidenceColor(entry.confidence)) + } + } + } + } + .listStyle(.plain) + } + .navigationTitle("LeViT Live") + .navigationBarTitleDisplayMode(.inline) + .onAppear { + classifier.startSession() + } + .onDisappear { + classifier.stopSession() + } + } + } + + private func confidenceColor(_ value: Float) -> Color { + if value > 0.7 { return .green } + if value > 0.4 { return .orange } + return .red + } +} + +// MARK: - Camera Preview UIViewRepresentable +struct CameraPreviewView: UIViewRepresentable { + let session: AVCaptureSession + + func makeUIView(context: Context) -> UIView { + let view = UIView(frame: .zero) + let previewLayer = AVCaptureVideoPreviewLayer(session: session) + previewLayer.videoGravity = .resizeAspectFill + view.layer.addSublayer(previewLayer) + context.coordinator.previewLayer = previewLayer + return view + } + + func updateUIView(_ uiView: UIView, context: Context) { + context.coordinator.previewLayer?.frame = uiView.bounds + } + + func makeCoordinator() -> Coordinator { + Coordinator() + } + + class Coordinator { + var previewLayer: AVCaptureVideoPreviewLayer? + } +} + +// MARK: - Camera Classifier ViewModel +class CameraClassifier: NSObject, ObservableObject, AVCaptureVideoDataOutputSampleBufferDelegate { + @Published var currentPrediction: ClassificationEntry? + @Published var history: [ClassificationEntry] = [] + @Published var isRunning = false + @Published var errorMessage: String? + @Published var fps: Double = 0 + + let captureSession = AVCaptureSession() + private var vnModel: VNCoreMLModel? + private var lastClassificationTime: Date = .distantPast + private var frameCount = 0 + private var fpsTimer: Date = Date() + private let classificationInterval: TimeInterval = 0.5 // Classify every 0.5 seconds + private let maxHistoryCount = 100 + + override init() { + super.init() + loadModel() + } + + private func loadModel() { + // PLACEHOLDER: Add LeViT_128S.mlpackage to the Xcode project. + // The compiled model class will be generated automatically by Xcode. + // Download from the converted_models directory and drag into the project navigator. + do { + guard let modelURL = Bundle.main.url(forResource: "LeViT_128S", withExtension: "mlmodelc") else { + DispatchQueue.main.async { + self.errorMessage = "Model not found. Add LeViT_128S.mlpackage to the project." 
+ } + return + } + let mlModel = try MLModel(contentsOf: modelURL) + vnModel = try VNCoreMLModel(for: mlModel) + } catch { + DispatchQueue.main.async { + self.errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + } + + func startSession() { + guard !captureSession.isRunning else { return } + + switch AVCaptureDevice.authorizationStatus(for: .video) { + case .authorized: + setupCamera() + case .notDetermined: + AVCaptureDevice.requestAccess(for: .video) { [weak self] granted in + if granted { + DispatchQueue.main.async { + self?.setupCamera() + } + } else { + DispatchQueue.main.async { + self?.errorMessage = "Camera access denied. Enable in Settings." + } + } + } + default: + DispatchQueue.main.async { + self.errorMessage = "Camera access denied. Enable in Settings." + } + } + } + + private func setupCamera() { + captureSession.beginConfiguration() + captureSession.sessionPreset = .medium + + guard let camera = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .back), + let input = try? AVCaptureDeviceInput(device: camera) else { + DispatchQueue.main.async { + self.errorMessage = "Cannot access camera." + } + captureSession.commitConfiguration() + return + } + + if captureSession.canAddInput(input) { + captureSession.addInput(input) + } + + let output = AVCaptureVideoDataOutput() + output.setSampleBufferDelegate(self, queue: DispatchQueue(label: "com.coreml-models.levitdemo.camera")) + output.alwaysDiscardsLateVideoFrames = true + + if captureSession.canAddOutput(output) { + captureSession.addOutput(output) + } + + captureSession.commitConfiguration() + + DispatchQueue.global(qos: .userInitiated).async { [weak self] in + self?.captureSession.startRunning() + DispatchQueue.main.async { + self?.isRunning = true + } + } + } + + func stopSession() { + if captureSession.isRunning { + captureSession.stopRunning() + } + DispatchQueue.main.async { + self.isRunning = false + } + } + + func toggleCamera() { + if isRunning { + stopSession() + } else { + DispatchQueue.global(qos: .userInitiated).async { [weak self] in + self?.captureSession.startRunning() + DispatchQueue.main.async { + self?.isRunning = true + } + } + } + } + + func clearHistory() { + history.removeAll() + } + + // MARK: - AVCaptureVideoDataOutputSampleBufferDelegate + + func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + // FPS calculation + frameCount += 1 + let now = Date() + let elapsed = now.timeIntervalSince(fpsTimer) + if elapsed >= 1.0 { + let currentFPS = Double(frameCount) / elapsed + DispatchQueue.main.async { + self.fps = currentFPS + } + frameCount = 0 + fpsTimer = now + } + + // Throttle classification + guard now.timeIntervalSince(lastClassificationTime) >= classificationInterval else { return } + lastClassificationTime = now + + guard let vnModel = vnModel else { return } + guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } + + let request = VNCoreMLRequest(model: vnModel) { [weak self] request, error in + self?.processResults(request: request, error: error) + } + request.imageCropAndScaleOption = .centerCrop + + let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .right, options: [:]) + try? handler.perform([request]) + } + + private func processResults(request: VNRequest, error: Error?) { + if let results = request.results as? 
[VNCoreMLFeatureValueObservation], + let multiArray = results.first?.featureValue.multiArrayValue { + let count = multiArray.count + var scores = [Float](repeating: 0, count: count) + for i in 0..<count { + scores[i] = multiArray[i].floatValue + } + let probs = softmax(scores) + guard let top = ImageNetLabels.topK(scores: probs, k: 1).first else { return } + let entry = ClassificationEntry( + label: top.label, + confidence: top.score, + timestamp: Date() + ) + DispatchQueue.main.async { + self.currentPrediction = entry + // Log a new entry only on a label change or a notable confidence shift + if let lastEntry = self.history.first { + if lastEntry.label != entry.label || abs(lastEntry.confidence - entry.confidence) > 0.1 { + self.history.insert(entry, at: 0) + } + } else { + self.history.insert(entry, at: 0) + } + + // Trim history + if self.history.count > self.maxHistoryCount { + self.history = Array(self.history.prefix(self.maxHistoryCount)) + } + } + } else if let results = request.results as? [VNClassificationObservation], + let top = results.first { + let entry = ClassificationEntry( + label: top.identifier, + confidence: top.confidence, + timestamp: Date() + ) + DispatchQueue.main.async { + self.currentPrediction = entry + if let lastEntry = self.history.first { + if lastEntry.label != entry.label || abs(lastEntry.confidence - entry.confidence) > 0.1 { + self.history.insert(entry, at: 0) + } + } else { + self.history.insert(entry, at: 0) + } + if self.history.count > self.maxHistoryCount { + self.history = Array(self.history.prefix(self.maxHistoryCount)) + } + } + } + } + + private func softmax(_ input: [Float]) -> [Float] { + let maxVal = input.max() ?? 0 + let expValues = input.map { exp($0 - maxVal) } + let sumExp = expValues.reduce(0, +) + return expValues.map { $0 / sumExp } + } +} + +#Preview { + ContentView() +}
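For reference alongside the Vision pipeline above, a minimal sketch of calling the model directly through Core ML and reading the raw "var_1140" logits named in the file's header comment. The input feature name "image" is an assumption (check the model's modelDescription for the real name), so treat this as illustrative only:

import CoreML
import CoreVideo

// Sketch: direct Core ML inference, bypassing Vision.
// ASSUMPTION: a single image input named "image" (unverified) and the
// "var_1140" multi-array output mentioned in ContentView.swift.
func classifyDirectly(model: MLModel, pixelBuffer: CVPixelBuffer) throws -> [Float] {
    let input = try MLDictionaryFeatureProvider(
        dictionary: ["image": MLFeatureValue(pixelBuffer: pixelBuffer)]
    )
    let output = try model.prediction(from: input)
    guard let logits = output.featureValue(for: "var_1140")?.multiArrayValue else {
        return []
    }
    // Copy into a plain Float array for the softmax/top-K helpers.
    return (0..<logits.count).map { logits[$0].floatValue }
}

Note that Vision's .centerCrop scaling is lost this way, so the pixel buffer must already match the model's 224x224 input.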
diff --git a/sample_apps/LeViTDemo/LeViTDemo/ImageNetLabels.swift b/sample_apps/LeViTDemo/LeViTDemo/ImageNetLabels.swift new file mode 100644 index 0000000..08f202d --- /dev/null +++ b/sample_apps/LeViTDemo/LeViTDemo/ImageNetLabels.swift @@ -0,0 +1,95 @@ +import Foundation + +// MARK: - ImageNet Labels (Condensed Demo Version) +// This file contains a subset of 66 common ImageNet-1K labels for demo purposes. +// For the full 1000-class label list, download from: +// https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt +// and replace this array with all 1000 entries. + +struct ImageNetLabels { + /// Full ImageNet-1K has 1000 labels. This is a condensed demo set. + /// Index positions correspond to the model output indices. + /// Replace with the full list for production use. + static let labels: [Int: String] = [ + 0: "tench", + 1: "goldfish", + 2: "great white shark", + 7: "cock", + 14: "indigo bunting", + 65: "sea snake", + 99: "goose", + 207: "golden retriever", + 208: "Labrador retriever", + 231: "collie", + 235: "German shepherd", + 258: "Samoyed", + 259: "Pomeranian", + 281: "tabby cat", + 282: "tiger cat", + 285: "Egyptian cat", + 291: "lion", + 340: "zebra", + 386: "African elephant", + 409: "analog clock", + 417: "balloon", + 430: "basketball", + 446: "bikini", + 457: "bow tie", + 468: "cab", + 504: "coffee mug", + 508: "computer keyboard", + 531: "digital watch", + 537: "dog sled", + 539: "drum", + 549: "envelope", + 555: "fire truck", + 569: "fountain", + 604: "golf ball", + 609: "grand piano", + 620: "hamburger", + 659: "mixing bowl", + 671: "mountain bike", + 673: "mouse", + 701: "parachute", + 717: "pickup truck", + 737: "pot", + 755: "redbone", + 779: "school bus", + 812: "space shuttle", + 817: "sports car", + 834: "sunglasses", + 849: "tennis ball", + 852: "thatch", + 859: "toaster", + 876: "tray", + 880: "umbrella", + 892: "wall clock", + 907: "wine bottle", + 920: "traffic light", + 934: "hot dog", + 945: "bell pepper", + 947: "mushroom", + 950: "orange", + 954: "banana", + 963: "pizza", + 965: "burrito", + 967: "espresso", + 985: "daisy", + 988: "sunflower", + 999: "toilet tissue" + ] + + /// Get the label for a given class index. + /// Returns "class_{index}" for indices not in the condensed set. + static func label(for index: Int) -> String { + return labels[index] ?? "class_\(index)" + } + + /// Get top-K predictions from a probability/score array. + static func topK(scores: [Float], k: Int = 5) -> [(index: Int, label: String, score: Float)] { + let indexed = scores.enumerated().map { (index: $0.offset, score: $0.element) } + let sorted = indexed.sorted { $0.score > $1.score } + let topK = sorted.prefix(k) + return topK.map { (index: $0.index, label: label(for: $0.index), score: $0.score) } + } +} diff --git a/sample_apps/LeViTDemo/LeViTDemo/Info.plist b/sample_apps/LeViTDemo/LeViTDemo/Info.plist new file mode 100644 index 0000000..cc12cca --- /dev/null +++ b/sample_apps/LeViTDemo/LeViTDemo/Info.plist @@ -0,0 +1,10 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> +<plist version="1.0"> +<dict> +	<key>NSCameraUsageDescription</key> +	<string>This app needs camera access for continuous real-time image classification.</string> +	<key>NSPhotoLibraryUsageDescription</key> +	<string>This app needs access to your photo library to select images for classification.</string> +</dict> +</plist> diff --git a/sample_apps/LeViTDemo/LeViTDemo/LeViTDemoApp.swift b/sample_apps/LeViTDemo/LeViTDemo/LeViTDemoApp.swift new file mode 100644 index 0000000..f71a36f --- /dev/null +++ b/sample_apps/LeViTDemo/LeViTDemo/LeViTDemoApp.swift @@ -0,0 +1,10 @@ +import SwiftUI + +@main +struct LeViTDemoApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} diff --git a/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo.xcodeproj/project.pbxproj b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo.xcodeproj/project.pbxproj new file mode 100644 index 0000000..a138f35 --- /dev/null +++ b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo.xcodeproj/project.pbxproj @@ -0,0 +1,344 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + A10000010000000000000001 /* MobileNetV3SmallDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10000020000000000000001 /* MobileNetV3SmallDemoApp.swift */; }; + A10000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10000020000000000000002 /* ContentView.swift */; }; + A10000010000000000000003 /* ImageNetLabels.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10000020000000000000003 /* ImageNetLabels.swift */; }; + A10000010000000000000004 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = A10000020000000000000004 /* Assets.xcassets */; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + A10000020000000000000001 /* MobileNetV3SmallDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MobileNetV3SmallDemoApp.swift; sourceTree = "<group>"; }; + A10000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; + A10000020000000000000003 /* ImageNetLabels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageNetLabels.swift; sourceTree = "<group>"; }; + A10000020000000000000004 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; + A10000020000000000000005 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; + A10000020000000000000010 /* MobileNetV3SmallDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = 
MobileNetV3SmallDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + A10000030000000000000001 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + A10000040000000000000001 = { + isa = PBXGroup; + children = ( + A10000040000000000000002 /* MobileNetV3SmallDemo */, + A10000040000000000000003 /* Products */, + ); + sourceTree = "<group>"; + }; + A10000040000000000000002 /* MobileNetV3SmallDemo */ = { + isa = PBXGroup; + children = ( + A10000020000000000000001 /* MobileNetV3SmallDemoApp.swift */, + A10000020000000000000002 /* ContentView.swift */, + A10000020000000000000003 /* ImageNetLabels.swift */, + A10000020000000000000004 /* Assets.xcassets */, + A10000020000000000000005 /* Info.plist */, + ); + path = MobileNetV3SmallDemo; + sourceTree = "<group>"; + }; + A10000040000000000000003 /* Products */ = { + isa = PBXGroup; + children = ( + A10000020000000000000010 /* MobileNetV3SmallDemo.app */, + ); + name = Products; + sourceTree = "<group>"; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + A10000050000000000000001 /* MobileNetV3SmallDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = A10000070000000000000001 /* Build configuration list for PBXNativeTarget "MobileNetV3SmallDemo" */; + buildPhases = ( + A10000060000000000000001 /* Sources */, + A10000030000000000000001 /* Frameworks */, + A10000060000000000000002 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = MobileNetV3SmallDemo; + productName = MobileNetV3SmallDemo; + productReference = A10000020000000000000010 /* MobileNetV3SmallDemo.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + A10000080000000000000001 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + A10000050000000000000001 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = A10000070000000000000003 /* Build configuration list for PBXProject "MobileNetV3SmallDemo" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = A10000040000000000000001; + productRefGroup = A10000040000000000000003 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + A10000050000000000000001 /* MobileNetV3SmallDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + A10000060000000000000002 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + A10000010000000000000004 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + A10000060000000000000001 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + A10000010000000000000001 /* MobileNetV3SmallDemoApp.swift in Sources */, + A10000010000000000000002 /* ContentView.swift in Sources */, + A10000010000000000000003 /* ImageNetLabels.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin 
XCBuildConfiguration section */ + A10000090000000000000001 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + A10000090000000000000002 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + 
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + A10000090000000000000003 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = MobileNetV3SmallDemo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time image classification."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.mobilenetv3small"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + A10000090000000000000004 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = MobileNetV3SmallDemo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time image classification."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.mobilenetv3small"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* 
Begin XCConfigurationList section */ + A10000070000000000000001 /* Build configuration list for PBXNativeTarget "MobileNetV3SmallDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + A10000090000000000000003 /* Debug */, + A10000090000000000000004 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + A10000070000000000000003 /* Build configuration list for PBXProject "MobileNetV3SmallDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + A10000090000000000000001 /* Debug */, + A10000090000000000000002 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = A10000080000000000000001 /* Project object */; +} diff --git a/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/Assets.xcassets/Contents.json b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/ContentView.swift b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/ContentView.swift new file mode 100644 index 0000000..6fcd462 --- /dev/null +++ b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/ContentView.swift @@ -0,0 +1,241 @@ +import SwiftUI +import UIKit +import AVFoundation +import CoreML +import Vision + +// MARK: - Camera Manager + +class CameraManager: NSObject, ObservableObject { + let session = AVCaptureSession() + var onFrame: ((CMSampleBuffer) -> Void)? + + private let sessionQueue = DispatchQueue(label: "camera.session") + + func configure() { + sessionQueue.async { [weak self] in + self?.setupSession() + } + } + + private func setupSession() { + session.beginConfiguration() + session.sessionPreset = .high + + guard let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .back), + let input = try? 
AVCaptureDeviceInput(device: device) else { + session.commitConfiguration() + return + } + + if session.canAddInput(input) { + session.addInput(input) + } + + let output = AVCaptureVideoDataOutput() + output.setSampleBufferDelegate(self, queue: DispatchQueue(label: "camera.frame")) + output.alwaysDiscardsLateVideoFrames = true + + if session.canAddOutput(output) { + session.addOutput(output) + } + + session.commitConfiguration() + session.startRunning() + } + + func stop() { + sessionQueue.async { [weak self] in + self?.session.stopRunning() + } + } +} + +extension CameraManager: AVCaptureVideoDataOutputSampleBufferDelegate { + func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + onFrame?(sampleBuffer) + } +} + +// MARK: - Camera Preview + +struct CameraPreview: UIViewRepresentable { + let session: AVCaptureSession + + func makeUIView(context: Context) -> UIView { + let view = UIView(frame: .zero) + let previewLayer = AVCaptureVideoPreviewLayer(session: session) + previewLayer.videoGravity = .resizeAspectFill + view.layer.addSublayer(previewLayer) + context.coordinator.previewLayer = previewLayer + return view + } + + func updateUIView(_ uiView: UIView, context: Context) { + context.coordinator.previewLayer?.frame = uiView.bounds + } + + func makeCoordinator() -> Coordinator { + Coordinator() + } + + class Coordinator { + var previewLayer: AVCaptureVideoPreviewLayer? + } +} + +// MARK: - Classifier + +class MobileNetClassifier: ObservableObject { + @Published var predictions: [(label: String, confidence: Float)] = [] + @Published var errorMessage: String? + + private var vnModel: VNCoreMLModel? + private var isProcessing = false + + init() { + loadModel() + } + + private func loadModel() { + // PLACEHOLDER: Add MobileNetV3Small.mlpackage to the Xcode project. + // The compiled .mlmodelc will be bundled automatically. + // Download from the CoreML-Models repository and drag into Xcode. + + guard let modelURL = Bundle.main.url(forResource: "MobileNetV3Small", withExtension: "mlmodelc") else { + DispatchQueue.main.async { + self.errorMessage = "Model not found. Please add MobileNetV3Small.mlpackage to the Xcode project." + } + return + } + + do { + let config = MLModelConfiguration() + config.computeUnits = .all + let mlModel = try MLModel(contentsOf: modelURL, configuration: config) + vnModel = try VNCoreMLModel(for: mlModel) + } catch { + DispatchQueue.main.async { + self.errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + } + + func classify(sampleBuffer: CMSampleBuffer) { + guard !isProcessing, let vnModel = vnModel else { return } + guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } + isProcessing = true + + let request = VNCoreMLRequest(model: vnModel) { [weak self] request, error in + defer { self?.isProcessing = false } + + if let results = request.results as? [VNCoreMLFeatureValueObservation], + let multiArray = results.first?.featureValue.multiArrayValue { + self?.processResults(multiArray: multiArray) + } else if let results = request.results as? [VNClassificationObservation] { + let top3 = results.prefix(3).map { (label: $0.identifier, confidence: $0.confidence) } + DispatchQueue.main.async { + self?.predictions = top3 + } + } + } + request.imageCropAndScaleOption = .centerCrop + + let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .right, options: [:]) + try? 
handler.perform([request]) + } + + private func processResults(multiArray: MLMultiArray) { + let count = multiArray.count + var scores = [Float](repeating: 0, count: count) + for i in 0..<count { + scores[i] = multiArray[i].floatValue + } + // Numerically stable softmax, inlined + let maxVal = scores.max() ?? 0 + let expScores = scores.map { exp($0 - maxVal) } + let sumExp = expScores.reduce(0, +) + let probs = expScores.map { $0 / sumExp } + let top3 = ImageNetLabels.topK(scores: probs, k: 3).map { (label: $0.label, confidence: $0.score) } + DispatchQueue.main.async { [weak self] in + self?.predictions = top3 + } + } +} diff --git a/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/ImageNetLabels.swift b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/ImageNetLabels.swift new file mode 100644 index 0000000..08f202d --- /dev/null +++ b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/ImageNetLabels.swift @@ -0,0 +1,95 @@ +import Foundation + +// MARK: - ImageNet Labels (Condensed Demo Version) +// This file contains a subset of 66 common ImageNet-1K labels for demo purposes. +// For the full 1000-class label list, download from: +// https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt +// and replace this array with all 1000 entries. + +struct ImageNetLabels { + /// Full ImageNet-1K has 1000 labels. This is a condensed demo set. + /// Index positions correspond to the model output indices. + /// Replace with the full list for production use. + static let labels: [Int: String] = [ + 0: "tench", + 1: "goldfish", + 2: "great white shark", + 7: "cock", + 14: "indigo bunting", + 65: "sea snake", + 99: "goose", + 207: "golden retriever", + 208: "Labrador retriever", + 231: "collie", + 235: "German shepherd", + 258: "Samoyed", + 259: "Pomeranian", + 281: "tabby cat", + 282: "tiger cat", + 285: "Egyptian cat", + 291: "lion", + 340: "zebra", + 386: "African elephant", + 409: "analog clock", + 417: "balloon", + 430: "basketball", + 446: "bikini", + 457: "bow tie", + 468: "cab", + 504: "coffee mug", + 508: "computer keyboard", + 531: "digital watch", + 537: "dog sled", + 539: "drum", + 549: "envelope", + 555: "fire truck", + 569: "fountain", + 604: "golf ball", + 609: "grand piano", + 620: "hamburger", + 659: "mixing bowl", + 671: "mountain bike", + 673: "mouse", + 701: "parachute", + 717: "pickup truck", + 737: "pot", + 755: "redbone", + 779: "school bus", + 812: "space shuttle", + 817: "sports car", + 834: "sunglasses", + 849: "tennis ball", + 852: "thatch", + 859: "toaster", + 876: "tray", + 880: "umbrella", + 892: "wall clock", + 907: "wine bottle", + 920: "traffic light", + 934: "hot dog", + 945: "bell pepper", + 947: "mushroom", + 950: "orange", + 954: "banana", + 963: "pizza", + 965: "burrito", + 967: "espresso", + 985: "daisy", + 988: "sunflower", + 999: "toilet tissue" + ] + + /// Get the label for a given class index. + /// Returns "class_{index}" for indices not in the condensed set. + static func label(for index: Int) -> String { + return labels[index] ?? "class_\(index)" + } + + /// Get top-K predictions from a probability/score array. + static func topK(scores: [Float], k: Int = 5) -> [(index: Int, label: String, score: Float)] { + let indexed = scores.enumerated().map { (index: $0.offset, score: $0.element) } + let sorted = indexed.sorted { $0.score > $1.score } + let topK = sorted.prefix(k) + return topK.map { (index: $0.index, label: label(for: $0.index), score: $0.score) } + } +} diff --git a/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/Info.plist b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/Info.plist new file mode 100644 index 0000000..e87e3cc --- /dev/null +++ b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/Info.plist @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> +<plist version="1.0"> +<dict> +	<key>NSCameraUsageDescription</key> +	<string>This app needs camera access for real-time image classification.</string> +</dict> +</plist> diff --git a/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/MobileNetV3SmallDemoApp.swift b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/MobileNetV3SmallDemoApp.swift new file mode 100644 index 0000000..990e27f --- /dev/null +++ b/sample_apps/MobileNetV3SmallDemo/MobileNetV3SmallDemo/MobileNetV3SmallDemoApp.swift @@ -0,0 +1,10 @@ +import SwiftUI + +@main +struct MobileNetV3SmallDemoApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +}
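The PLACEHOLDER comments in the loadModel() functions above assume the .mlpackage is dragged into the Xcode project so it ships precompiled as an .mlmodelc. If the model is instead fetched at runtime (as the Google Drive distribution suggests), it can be compiled on-device first; a sketch, with an illustrative function name:

import CoreML
import Foundation

// Sketch: compile a downloaded model on-device, then load it.
// MLModel.compileModel(at:) writes a .mlmodelc directory to a temporary
// location; cache it somewhere persistent to avoid recompiling every launch.
func loadDownloadedModel(at rawModelURL: URL) throws -> MLModel {
    let compiledURL = try MLModel.compileModel(at: rawModelURL)
    let config = MLModelConfiguration()
    config.computeUnits = .all   // CPU, GPU, and Neural Engine
    return try MLModel(contentsOf: compiledURL, configuration: config)
}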
diff --git a/sample_apps/MobileOneDemo/MobileOneDemo.xcodeproj/project.pbxproj b/sample_apps/MobileOneDemo/MobileOneDemo.xcodeproj/project.pbxproj new file mode 100644 index 0000000..559c3b7 --- /dev/null +++ b/sample_apps/MobileOneDemo/MobileOneDemo.xcodeproj/project.pbxproj @@ -0,0 +1,344 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + E50000010000000000000001 /* MobileOneDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = E50000020000000000000001 /* MobileOneDemoApp.swift */; }; + E50000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E50000020000000000000002 /* ContentView.swift */; }; + E50000010000000000000003 /* ImageNetLabels.swift in Sources */ = {isa = PBXBuildFile; fileRef = E50000020000000000000003 /* ImageNetLabels.swift */; }; + E50000010000000000000004 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = E50000020000000000000004 /* Assets.xcassets */; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + E50000020000000000000001 /* MobileOneDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MobileOneDemoApp.swift; sourceTree = "<group>"; }; + E50000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; + E50000020000000000000003 /* ImageNetLabels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageNetLabels.swift; sourceTree = "<group>"; }; + E50000020000000000000004 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; + E50000020000000000000005 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; + E50000020000000000000010 /* MobileOneDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = MobileOneDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + E50000030000000000000001 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + E50000040000000000000001 = { + isa = PBXGroup; + children = ( + E50000040000000000000002 /* MobileOneDemo */, + E50000040000000000000003 /* Products */, + ); + sourceTree = "<group>"; + }; + E50000040000000000000002 /* MobileOneDemo */ = { + isa = PBXGroup; + children = ( + E50000020000000000000001 /* MobileOneDemoApp.swift */, + E50000020000000000000002 /* ContentView.swift */, + E50000020000000000000003 /* ImageNetLabels.swift */, + E50000020000000000000004 /* Assets.xcassets */, + E50000020000000000000005 /* Info.plist */, + ); + path = MobileOneDemo; + sourceTree = "<group>"; + }; + E50000040000000000000003 /* Products */ = { + isa = PBXGroup; + children = ( + E50000020000000000000010 /* MobileOneDemo.app */, + ); + name = Products; + sourceTree = "<group>"; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + E50000050000000000000001 /* MobileOneDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = E50000070000000000000001 /* Build configuration list for PBXNativeTarget "MobileOneDemo" */; + buildPhases = ( + E50000060000000000000001 /* Sources */, + E50000030000000000000001 /* Frameworks */, + E50000060000000000000002 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = MobileOneDemo; + productName = MobileOneDemo; + productReference = E50000020000000000000010 /* MobileOneDemo.app */; + productType = "com.apple.product-type.application"; + }; +/* End 
PBXNativeTarget section */ + +/* Begin PBXProject section */ + E50000080000000000000001 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + E50000050000000000000001 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = E50000070000000000000003 /* Build configuration list for PBXProject "MobileOneDemo" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = E50000040000000000000001; + productRefGroup = E50000040000000000000003 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + E50000050000000000000001 /* MobileOneDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + E50000060000000000000002 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + E50000010000000000000004 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + E50000060000000000000001 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + E50000010000000000000001 /* MobileOneDemoApp.swift in Sources */, + E50000010000000000000002 /* ContentView.swift in Sources */, + E50000010000000000000003 /* ImageNetLabels.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + E50000090000000000000001 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + 
GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + E50000090000000000000002 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + E50000090000000000000003 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = MobileOneDemo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time image classification with FPS monitoring."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 
1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.mobileone"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + E50000090000000000000004 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = MobileOneDemo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time image classification with FPS monitoring."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.mobileone"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + E50000070000000000000001 /* Build configuration list for PBXNativeTarget "MobileOneDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + E50000090000000000000003 /* Debug */, + E50000090000000000000004 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + E50000070000000000000003 /* Build configuration list for PBXProject "MobileOneDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + E50000090000000000000001 /* Debug */, + E50000090000000000000002 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = E50000080000000000000001 /* Project object */; +} diff --git a/sample_apps/MobileOneDemo/MobileOneDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/MobileOneDemo/MobileOneDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/sample_apps/MobileOneDemo/MobileOneDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/MobileOneDemo/MobileOneDemo/Assets.xcassets/Contents.json b/sample_apps/MobileOneDemo/MobileOneDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/sample_apps/MobileOneDemo/MobileOneDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/MobileOneDemo/MobileOneDemo/ContentView.swift b/sample_apps/MobileOneDemo/MobileOneDemo/ContentView.swift new file mode 100644 index 0000000..9c8727c --- /dev/null +++ b/sample_apps/MobileOneDemo/MobileOneDemo/ContentView.swift @@ -0,0 
+1,379 @@ +import SwiftUI +import UIKit +import AVFoundation +import CoreML +import Vision + +// MARK: - Camera Manager + +class CameraManager: NSObject, ObservableObject { + let session = AVCaptureSession() + var onFrame: ((CMSampleBuffer) -> Void)? + + private let sessionQueue = DispatchQueue(label: "camera.session") + + func configure() { + sessionQueue.async { [weak self] in + self?.setupSession() + } + } + + private func setupSession() { + session.beginConfiguration() + session.sessionPreset = .medium + + guard let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .back), + let input = try? AVCaptureDeviceInput(device: device) else { + session.commitConfiguration() + return + } + + if session.canAddInput(input) { + session.addInput(input) + } + + let output = AVCaptureVideoDataOutput() + output.setSampleBufferDelegate(self, queue: DispatchQueue(label: "camera.frame")) + output.alwaysDiscardsLateVideoFrames = true + output.videoSettings = [kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA] + + if session.canAddOutput(output) { + session.addOutput(output) + } + + session.commitConfiguration() + session.startRunning() + } + + func stop() { + sessionQueue.async { [weak self] in + self?.session.stopRunning() + } + } +} + +extension CameraManager: AVCaptureVideoDataOutputSampleBufferDelegate { + func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + onFrame?(sampleBuffer) + } +} + +// MARK: - Camera Preview + +struct CameraPreview: UIViewRepresentable { + let session: AVCaptureSession + + func makeUIView(context: Context) -> UIView { + let view = UIView(frame: .zero) + let previewLayer = AVCaptureVideoPreviewLayer(session: session) + previewLayer.videoGravity = .resizeAspectFill + view.layer.addSublayer(previewLayer) + context.coordinator.previewLayer = previewLayer + return view + } + + func updateUIView(_ uiView: UIView, context: Context) { + context.coordinator.previewLayer?.frame = uiView.bounds + } + + func makeCoordinator() -> Coordinator { Coordinator() } + class Coordinator { var previewLayer: AVCaptureVideoPreviewLayer? } +} + +// MARK: - FPS Counter + +class FPSCounter: ObservableObject { + @Published var fps: Double = 0 + @Published var inferenceMs: Double = 0 + @Published var peakFps: Double = 0 + @Published var minInferenceMs: Double = Double.infinity + + private var frameTimestamps: [CFAbsoluteTime] = [] + private var inferenceTimes: [Double] = [] + private let windowSize = 30 + + func recordFrame(inferenceTime: Double) { + let now = CFAbsoluteTimeGetCurrent() + frameTimestamps.append(now) + inferenceTimes.append(inferenceTime) + + // Keep only recent frames + while frameTimestamps.count > windowSize { + frameTimestamps.removeFirst() + } + while inferenceTimes.count > windowSize { + inferenceTimes.removeFirst() + } + + // Calculate FPS from frame timestamps + if frameTimestamps.count >= 2 { + let duration = frameTimestamps.last! - frameTimestamps.first! + let currentFps = duration > 0 ? Double(frameTimestamps.count - 1) / duration : 0 + DispatchQueue.main.async { [weak self] in + self?.fps = currentFps + if currentFps > (self?.peakFps ?? 0) { + self?.peakFps = currentFps + } + } + } + + // Average inference time + let avgInference = inferenceTimes.reduce(0, +) / Double(inferenceTimes.count) + DispatchQueue.main.async { [weak self] in + self?.inferenceMs = avgInference + if inferenceTime < (self?.minInferenceMs ?? 
Double.infinity) { + self?.minInferenceMs = inferenceTime + } + } + } +} + +// MARK: - MobileOne Classifier + +class MobileOneClassifier: ObservableObject { + @Published var predictions: [(label: String, confidence: Float)] = [] + @Published var errorMessage: String? + + private var vnModel: VNCoreMLModel? + private var isProcessing = false + + let fpsCounter = FPSCounter() + + init() { + loadModel() + } + + private func loadModel() { + // PLACEHOLDER: Add MobileOne_S0.mlpackage to the Xcode project. + // The compiled .mlmodelc will be bundled automatically. + // Download from the CoreML-Models repository and drag into Xcode. + + guard let modelURL = Bundle.main.url(forResource: "MobileOne_S0", withExtension: "mlmodelc") else { + DispatchQueue.main.async { + self.errorMessage = "Model not found. Please add MobileOne_S0.mlpackage to the Xcode project." + } + return + } + + do { + let config = MLModelConfiguration() + config.computeUnits = .all + let mlModel = try MLModel(contentsOf: modelURL, configuration: config) + vnModel = try VNCoreMLModel(for: mlModel) + } catch { + DispatchQueue.main.async { + self.errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + } + + func classify(sampleBuffer: CMSampleBuffer) { + guard !isProcessing, let vnModel = vnModel else { return } + guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } + isProcessing = true + + let request = VNCoreMLRequest(model: vnModel) { [weak self] request, error in + defer { self?.isProcessing = false } + + if let results = request.results as? [VNCoreMLFeatureValueObservation], + let multiArray = results.first?.featureValue.multiArrayValue { + self?.processResults(multiArray: multiArray) + } else if let results = request.results as? [VNClassificationObservation] { + let top3 = results.prefix(3).map { (label: $0.identifier, confidence: $0.confidence) } + DispatchQueue.main.async { + self?.predictions = top3 + } + } + } + request.imageCropAndScaleOption = .centerCrop + + let startTime = CFAbsoluteTimeGetCurrent() + let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .right, options: [:]) + try? handler.perform([request]) + let elapsed = (CFAbsoluteTimeGetCurrent() - startTime) * 1000.0 + + fpsCounter.recordFrame(inferenceTime: elapsed) + } + + private func processResults(multiArray: MLMultiArray) { + let count = multiArray.count + var scores = [Float](repeating: 0, count: count) + for i in 0..<count { + scores[i] = multiArray[i].floatValue + } + // Numerically stable softmax, inlined + let maxVal = scores.max() ?? 0 + let expScores = scores.map { exp($0 - maxVal) } + let sumExp = expScores.reduce(0, +) + let probs = expScores.map { $0 / sumExp } + let top3 = ImageNetLabels.topK(scores: probs, k: 3).map { (label: $0.label, confidence: $0.score) } + DispatchQueue.main.async { [weak self] in + self?.predictions = top3 + } + } +} + +// MARK: - Content View + +struct ContentView: View { + private func fpsColor(_ fps: Double) -> Color { + if fps >= 30 { return .green } + if fps >= 15 { return .yellow } + return .red + } + + private func rankColor(_ index: Int) -> Color { + switch index { + case 0: return .green + case 1: return .cyan + case 2: return .orange + default: return .gray + } + } +} + +#Preview { + ContentView() +}
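The per-frame timing in classify() above works because VNImageRequestHandler.perform runs synchronously on the calling queue. For what it's worth, the same measurement expressed with ContinuousClock, which the app's iOS 16 deployment target already allows; this is a sketch, not part of the demo:

import Foundation

// Sketch: millisecond timing with ContinuousClock instead of
// CFAbsoluteTimeGetCurrent; behaviour is equivalent for this use.
func inferenceMilliseconds(_ work: () throws -> Void) rethrows -> Double {
    let duration = try ContinuousClock().measure(work)
    let (seconds, attoseconds) = duration.components
    return Double(seconds) * 1_000 + Double(attoseconds) * 1e-15
}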
+    /// Replace with the full list for production use.
+    static let labels: [Int: String] = [
+        0: "tench",
+        1: "goldfish",
+        2: "great white shark",
+        7: "cock",
+        14: "indigo bunting",
+        65: "sea snake",
+        99: "goose",
+        207: "golden retriever",
+        208: "Labrador retriever",
+        231: "collie",
+        235: "German shepherd",
+        258: "Samoyed",
+        259: "Pomeranian",
+        281: "tabby cat",
+        282: "tiger cat",
+        285: "Egyptian cat",
+        291: "lion",
+        340: "zebra",
+        386: "African elephant",
+        409: "analog clock",
+        417: "balloon",
+        430: "basketball",
+        446: "bikini",
+        457: "bow tie",
+        468: "cab",
+        504: "coffee mug",
+        508: "computer keyboard",
+        531: "digital watch",
+        537: "dog sled",
+        539: "drum",
+        549: "envelope",
+        555: "fire truck",
+        569: "fountain",
+        604: "golf ball",
+        609: "grand piano",
+        620: "hamburger",
+        659: "mixing bowl",
+        671: "mountain bike",
+        673: "mouse",
+        701: "parachute",
+        717: "pickup truck",
+        737: "pot",
+        755: "redbone",
+        779: "school bus",
+        812: "space shuttle",
+        817: "sports car",
+        834: "sunglasses",
+        849: "tennis ball",
+        852: "thatch",
+        859: "toaster",
+        876: "tray",
+        880: "umbrella",
+        892: "wall clock",
+        907: "wine bottle",
+        920: "traffic light",
+        934: "hot dog",
+        945: "bell pepper",
+        947: "mushroom",
+        950: "orange",
+        954: "banana",
+        963: "pizza",
+        965: "burrito",
+        967: "espresso",
+        985: "daisy",
+        988: "sunflower",
+        999: "toilet tissue"
+    ]
+
+    /// Get the label for a given class index.
+    /// Returns "class_{index}" for indices not in the condensed set.
+    static func label(for index: Int) -> String {
+        return labels[index] ?? "class_\(index)"
+    }
+
+    /// Get top-K predictions from a probability/score array.
+    static func topK(scores: [Float], k: Int = 5) -> [(index: Int, label: String, score: Float)] {
+        let indexed = scores.enumerated().map { (index: $0.offset, score: $0.element) }
+        let sorted = indexed.sorted { $0.score > $1.score }
+        let topK = sorted.prefix(k)
+        return topK.map { (index: $0.index, label: label(for: $0.index), score: $0.score) }
+    }
+}
diff --git a/sample_apps/MobileOneDemo/MobileOneDemo/Info.plist b/sample_apps/MobileOneDemo/MobileOneDemo/Info.plist
new file mode 100644
index 0000000..2e3245c
--- /dev/null
+++ b/sample_apps/MobileOneDemo/MobileOneDemo/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSCameraUsageDescription</key>
+	<string>This app needs camera access for real-time image classification with FPS monitoring.</string>
+</dict>
+</plist>
diff --git a/sample_apps/MobileOneDemo/MobileOneDemo/MobileOneDemoApp.swift b/sample_apps/MobileOneDemo/MobileOneDemo/MobileOneDemoApp.swift
new file mode 100644
index 0000000..ece2c64
--- /dev/null
+++ b/sample_apps/MobileOneDemo/MobileOneDemo/MobileOneDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct MobileOneDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/sample_apps/PoolFormerDemo/PoolFormerDemo.xcodeproj/project.pbxproj b/sample_apps/PoolFormerDemo/PoolFormerDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..1fc251b
--- /dev/null
+++ b/sample_apps/PoolFormerDemo/PoolFormerDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,342 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		D40000001 /* PoolFormerDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = D40000011 /* PoolFormerDemoApp.swift */; };
+		D40000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = D40000012 /* ContentView.swift */; };
+		D40000003 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = D40000013 /* Assets.xcassets */; };
+		D40000004 /* ImageNetLabels.swift in Sources */ = {isa = PBXBuildFile; fileRef = D40000014 /* ImageNetLabels.swift */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		D40000010 /* PoolFormerDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = PoolFormerDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		D40000011 /* PoolFormerDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PoolFormerDemoApp.swift; sourceTree = "<group>"; };
+		D40000012 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		D40000013 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		D40000014 /* ImageNetLabels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageNetLabels.swift; sourceTree = "<group>"; };
+		D40000015 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		D40000020 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		D40000030 = {
+			isa = PBXGroup;
+			children = (
+				D40000031 /* PoolFormerDemo */,
+				D40000032 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		D40000031 /* PoolFormerDemo */ = {
+			isa = PBXGroup;
+			children = (
+				D40000011 /* PoolFormerDemoApp.swift */,
+				D40000012 /* ContentView.swift */,
+				D40000014 /* ImageNetLabels.swift */,
+				D40000013 /* Assets.xcassets */,
+				D40000015 /* Info.plist */,
+			);
+			path = PoolFormerDemo;
+			sourceTree = "<group>";
+		};
+		D40000032 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				D40000010 /* PoolFormerDemo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		D40000040 /* PoolFormerDemo */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = D40000060 /* Build configuration list for PBXNativeTarget "PoolFormerDemo" */;
+			buildPhases = (
+				D40000041 /* Sources */,
+				D40000020 /* Frameworks */,
+				D40000042 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = PoolFormerDemo;
+			productName = PoolFormerDemo;
+			productReference = D40000010 /* PoolFormerDemo.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		D40000050 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				BuildIndependentTargetsInParallel = 1;
+				LastSwiftUpdateCheck = 1500;
+				LastUpgradeCheck = 1500;
+				TargetAttributes = {
+					D40000040 = {
+						CreatedOnToolsVersion = 15.0;
+					};
+				};
+			};
+			buildConfigurationList = D40000070 /* Build configuration list for PBXProject "PoolFormerDemo" */;
+			compatibilityVersion = "Xcode 14.0";
+
developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = D40000030; + productRefGroup = D40000032 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + D40000040 /* PoolFormerDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + D40000042 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + D40000003 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + D40000041 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + D40000001 /* PoolFormerDemoApp.swift in Sources */, + D40000002 /* ContentView.swift in Sources */, + D40000004 /* ImageNetLabels.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + D40000061 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = PoolFormerDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.poolformerdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + D40000062 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = PoolFormerDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.poolformerdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; + D40000071 /* Debug */ = { + isa = 
XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + D40000072 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + 
COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + D40000060 /* Build configuration list for PBXNativeTarget "PoolFormerDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + D40000061 /* Debug */, + D40000062 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + D40000070 /* Build configuration list for PBXProject "PoolFormerDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + D40000071 /* Debug */, + D40000072 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = D40000050 /* Project object */; +} diff --git a/sample_apps/PoolFormerDemo/PoolFormerDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/PoolFormerDemo/PoolFormerDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/sample_apps/PoolFormerDemo/PoolFormerDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/PoolFormerDemo/PoolFormerDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/sample_apps/PoolFormerDemo/PoolFormerDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/sample_apps/PoolFormerDemo/PoolFormerDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/PoolFormerDemo/PoolFormerDemo/Assets.xcassets/Contents.json b/sample_apps/PoolFormerDemo/PoolFormerDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/sample_apps/PoolFormerDemo/PoolFormerDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/PoolFormerDemo/PoolFormerDemo/ContentView.swift b/sample_apps/PoolFormerDemo/PoolFormerDemo/ContentView.swift new file mode 100644 index 0000000..c03c365 --- /dev/null +++ b/sample_apps/PoolFormerDemo/PoolFormerDemo/ContentView.swift @@ -0,0 +1,377 @@ +import SwiftUI +import UIKit +import CoreML +import Vision +import PhotosUI +import UniformTypeIdentifiers + +// MARK: - Drag-and-Drop Image Classifier +// Uses PoolFormer_S12 model (224x224 input, 1000-class ImageNet output) +// Output feature name: "var_646" + +struct ContentView: View { + @StateObject private var classifier = DropClassifier() + + var body: some View { + NavigationStack { + VStack(spacing: 0) { + if let error 
= classifier.errorMessage { + HStack { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.yellow) + Text(error) + .font(.caption) + } + .padding() + .background(Color(.systemOrange).opacity(0.1)) + } + + if classifier.classifiedItems.isEmpty && classifier.droppedImage == nil { + // Drop zone + VStack(spacing: 0) { + Spacer() + DropZoneView( + isTargeted: $classifier.isDropTargeted, + classifier: classifier + ) + .padding() + + // Also allow photo picker as fallback + PhotosPicker( + selection: $classifier.selectedItem, + matching: .images + ) { + Label("Or pick from Photos", systemImage: "photo.on.rectangle") + .font(.subheadline) + } + .padding(.bottom, 8) + + Text("On iPad, drag photos from Files or Safari onto the drop zone") + .font(.caption) + .foregroundColor(.secondary) + .multilineTextAlignment(.center) + .padding(.horizontal) + Spacer() + } + } else { + // Results view + ScrollView { + VStack(spacing: 16) { + // Current dropped image + if let image = classifier.droppedImage { + VStack(spacing: 12) { + Image(uiImage: image) + .resizable() + .scaledToFit() + .frame(maxHeight: 250) + .cornerRadius(12) + .shadow(radius: 4) + + if classifier.isProcessing { + ProgressView("Classifying...") + } else if !classifier.currentResults.isEmpty { + ResultsCard(results: classifier.currentResults) + } + } + .padding() + } + + // Drop zone for adding more + DropZoneView( + isTargeted: $classifier.isDropTargeted, + classifier: classifier, + compact: true + ) + .padding(.horizontal) + + PhotosPicker( + selection: $classifier.selectedItem, + matching: .images + ) { + Label("Or pick from Photos", systemImage: "photo.on.rectangle") + .font(.caption) + } + + // History + if !classifier.classifiedItems.isEmpty { + VStack(alignment: .leading, spacing: 8) { + HStack { + Text("Classification History") + .font(.headline) + Spacer() + Button("Clear") { + classifier.clearHistory() + } + .font(.caption) + } + .padding(.horizontal) + + ForEach(classifier.classifiedItems) { item in + HistoryRow(item: item) + .padding(.horizontal) + } + } + } + } + .padding(.vertical) + } + } + } + .navigationTitle("PoolFormer") + .navigationBarTitleDisplayMode(.large) + } + } +} + +// MARK: - Drop Zone View +struct DropZoneView: View { + @Binding var isTargeted: Bool + let classifier: DropClassifier + var compact: Bool = false + + var body: some View { + let height: CGFloat = compact ? 100 : 250 + + RoundedRectangle(cornerRadius: 16) + .strokeBorder( + isTargeted ? Color.accentColor : Color.secondary.opacity(0.3), + style: StrokeStyle(lineWidth: 3, dash: [10]) + ) + .background( + RoundedRectangle(cornerRadius: 16) + .fill(isTargeted ? Color.accentColor.opacity(0.1) : Color(.systemGray6)) + ) + .frame(height: height) + .overlay { + VStack(spacing: 8) { + Image(systemName: isTargeted ? "arrow.down.circle.fill" : "arrow.down.doc") + .font(compact ? .title3 : .largeTitle) + .foregroundColor(isTargeted ? 
.accentColor : .secondary) + if !compact { + Text("Drop an image here") + .font(.headline) + .foregroundColor(.secondary) + Text("Drag a photo onto this area to classify it") + .font(.caption) + .foregroundColor(.secondary) + } else { + Text("Drop another image") + .font(.caption) + .foregroundColor(.secondary) + } + } + } + .onDrop(of: [UTType.image], isTargeted: $isTargeted) { providers in + classifier.handleDrop(providers: providers) + return true + } + } +} + +// MARK: - Results Card +struct ResultsCard: View { + let results: [(label: String, score: Float)] + + var body: some View { + VStack(alignment: .leading, spacing: 8) { + Text("Classification Results") + .font(.headline) + + ForEach(Array(results.enumerated()), id: \.offset) { index, result in + HStack { + Text(result.label) + .font(.subheadline) + Spacer() + Text(String(format: "%.1f%%", result.score * 100)) + .font(.subheadline) + .foregroundColor(.secondary) + } + ProgressView(value: result.score) + .tint(index == 0 ? .accentColor : .gray) + } + } + .padding() + .background(Color(.systemBackground)) + .cornerRadius(12) + .shadow(color: .black.opacity(0.1), radius: 4) + } +} + +// MARK: - History Row +struct HistoryItem: Identifiable { + let id = UUID() + let image: UIImage + let topLabel: String + let confidence: Float + let timestamp: Date +} + +struct HistoryRow: View { + let item: HistoryItem + + var body: some View { + HStack(spacing: 12) { + Image(uiImage: item.image) + .resizable() + .scaledToFill() + .frame(width: 50, height: 50) + .cornerRadius(8) + .clipped() + + VStack(alignment: .leading) { + Text(item.topLabel) + .font(.subheadline) + .fontWeight(.medium) + Text(String(format: "%.1f%%", item.confidence * 100)) + .font(.caption) + .foregroundColor(.secondary) + } + + Spacer() + + Text(item.timestamp, style: .time) + .font(.caption2) + .foregroundColor(.secondary) + } + .padding(8) + .background(Color(.systemGray6)) + .cornerRadius(8) + } +} + +// MARK: - Classifier ViewModel +@MainActor +class DropClassifier: ObservableObject { + @Published var droppedImage: UIImage? + @Published var currentResults: [(label: String, score: Float)] = [] + @Published var classifiedItems: [HistoryItem] = [] + @Published var isProcessing = false + @Published var isDropTargeted = false + @Published var errorMessage: String? + + @Published var selectedItem: PhotosPickerItem? { + didSet { Task { await loadFromPicker() } } + } + + private var vnModel: VNCoreMLModel? + + init() { + loadModel() + } + + private func loadModel() { + // PLACEHOLDER: Add PoolFormer_S12.mlpackage to the Xcode project. + // The compiled model class will be generated automatically by Xcode. + // Download from the converted_models directory and drag into the project navigator. + do { + guard let modelURL = Bundle.main.url(forResource: "PoolFormer_S12", withExtension: "mlmodelc") else { + errorMessage = "Model not found. Add PoolFormer_S12.mlpackage to the project." + return + } + let mlModel = try MLModel(contentsOf: modelURL) + vnModel = try VNCoreMLModel(for: mlModel) + } catch { + errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + + private func loadFromPicker() async { + guard let item = selectedItem, + let data = try? 
await item.loadTransferable(type: Data.self),
+              let uiImage = UIImage(data: data) else { return }
+        await classifyImage(uiImage)
+    }
+
+    func handleDrop(providers: [NSItemProvider]) {
+        guard let provider = providers.first else { return }
+
+        if provider.canLoadObject(ofClass: UIImage.self) {
+            provider.loadObject(ofClass: UIImage.self) { [weak self] image, error in
+                guard let self = self, let uiImage = image as? UIImage else { return }
+                Task { @MainActor in
+                    await self.classifyImage(uiImage)
+                }
+            }
+        } else {
+            provider.loadDataRepresentation(forTypeIdentifier: UTType.image.identifier) { [weak self] data, error in
+                guard let self = self, let data = data, let uiImage = UIImage(data: data) else { return }
+                Task { @MainActor in
+                    await self.classifyImage(uiImage)
+                }
+            }
+        }
+    }
+
+    private func classifyImage(_ image: UIImage) async {
+        // Save previous result to history
+        if let prevImage = droppedImage, !currentResults.isEmpty {
+            let historyItem = HistoryItem(
+                image: prevImage,
+                topLabel: currentResults.first?.label ?? "Unknown",
+                confidence: currentResults.first?.score ?? 0,
+                timestamp: Date()
+            )
+            classifiedItems.insert(historyItem, at: 0)
+            if classifiedItems.count > 20 {
+                classifiedItems = Array(classifiedItems.prefix(20))
+            }
+        }
+
+        droppedImage = image
+        currentResults = []
+        isProcessing = true
+
+        guard let vnModel = vnModel else {
+            isProcessing = false
+            return
+        }
+
+        guard let cgImage = image.cgImage else {
+            isProcessing = false
+            return
+        }
+
+        let request = VNCoreMLRequest(model: vnModel)
+        request.imageCropAndScaleOption = .centerCrop
+
+        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+
+        do {
+            try handler.perform([request])
+
+            if let results = request.results as? [VNCoreMLFeatureValueObservation],
+               let multiArray = results.first?.featureValue.multiArrayValue {
+                let count = multiArray.count
+                var scores = [Float](repeating: 0, count: count)
+                // Copy the raw logits out of the MLMultiArray, then normalize with softmax.
+                for i in 0..<count {
+                    scores[i] = multiArray[i].floatValue
+                }
+                let probabilities = softmax(scores)
+                currentResults = ImageNetLabels.topK(scores: probabilities, k: 5)
+                    .map { (label: $0.label, score: $0.score) }
+            }
+        } catch {
+            errorMessage = "Classification failed: \(error.localizedDescription)"
+        }
+
+        isProcessing = false
+    }
+
+    private func softmax(_ input: [Float]) -> [Float] {
+        let maxVal = input.max() ?? 0
+        let expValues = input.map { exp($0 - maxVal) }
+        let sumExp = expValues.reduce(0, +)
+        return expValues.map { $0 / sumExp }
+    }
+}
+
+#Preview {
+    ContentView()
+}
diff --git a/sample_apps/PoolFormerDemo/PoolFormerDemo/ImageNetLabels.swift b/sample_apps/PoolFormerDemo/PoolFormerDemo/ImageNetLabels.swift
new file mode 100644
index 0000000..08f202d
--- /dev/null
+++ b/sample_apps/PoolFormerDemo/PoolFormerDemo/ImageNetLabels.swift
@@ -0,0 +1,95 @@
+import Foundation
+
+// MARK: - ImageNet Labels (Condensed Demo Version)
+// This file contains a small subset of common ImageNet-1K labels for demo purposes.
+// For the full 1000-class label list, download from:
+// https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt
+// and replace this dictionary with all 1000 entries.
+
+struct ImageNetLabels {
+    /// Full ImageNet-1K has 1000 labels. This is a condensed demo set.
+    /// Index positions correspond to the model output indices.
+    /// Replace with the full list for production use.
+    static let labels: [Int: String] = [
+        0: "tench",
+        1: "goldfish",
+        2: "great white shark",
+        7: "cock",
+        14: "indigo bunting",
+        65: "sea snake",
+        99: "goose",
+        207: "golden retriever",
+        208: "Labrador retriever",
+        231: "collie",
+        235: "German shepherd",
+        258: "Samoyed",
+        259: "Pomeranian",
+        281: "tabby cat",
+        282: "tiger cat",
+        285: "Egyptian cat",
+        291: "lion",
+        340: "zebra",
+        386: "African elephant",
+        409: "analog clock",
+        417: "balloon",
+        430: "basketball",
+        446: "bikini",
+        457: "bow tie",
+        468: "cab",
+        504: "coffee mug",
+        508: "computer keyboard",
+        531: "digital watch",
+        537: "dog sled",
+        539: "drum",
+        549: "envelope",
+        555: "fire truck",
+        569: "fountain",
+        604: "golf ball",
+        609: "grand piano",
+        620: "hamburger",
+        659: "mixing bowl",
+        671: "mountain bike",
+        673: "mouse",
+        701: "parachute",
+        717: "pickup truck",
+        737: "pot",
+        755: "redbone",
+        779: "school bus",
+        812: "space shuttle",
+        817: "sports car",
+        834: "sunglasses",
+        849: "tennis ball",
+        852: "thatch",
+        859: "toaster",
+        876: "tray",
+        880: "umbrella",
+        892: "wall clock",
+        907: "wine bottle",
+        920: "traffic light",
+        934: "hot dog",
+        945: "bell pepper",
+        947: "mushroom",
+        950: "orange",
+        954: "banana",
+        963: "pizza",
+        965: "burrito",
+        967: "espresso",
+        985: "daisy",
+        988: "sunflower",
+        999: "toilet tissue"
+    ]
+
+    /// Get the label for a given class index.
+    /// Returns "class_{index}" for indices not in the condensed set.
+    static func label(for index: Int) -> String {
+        return labels[index] ?? "class_\(index)"
+    }
+
+    /// Get top-K predictions from a probability/score array.
+    static func topK(scores: [Float], k: Int = 5) -> [(index: Int, label: String, score: Float)] {
+        let indexed = scores.enumerated().map { (index: $0.offset, score: $0.element) }
+        let sorted = indexed.sorted { $0.score > $1.score }
+        let topK = sorted.prefix(k)
+        return topK.map { (index: $0.index, label: label(for: $0.index), score: $0.score) }
+    }
+}
diff --git a/sample_apps/PoolFormerDemo/PoolFormerDemo/Info.plist b/sample_apps/PoolFormerDemo/PoolFormerDemo/Info.plist
new file mode 100644
index 0000000..3faede7
--- /dev/null
+++ b/sample_apps/PoolFormerDemo/PoolFormerDemo/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>This app needs access to your photo library to select images for classification.</string>
+</dict>
+</plist>
diff --git a/sample_apps/PoolFormerDemo/PoolFormerDemo/PoolFormerDemoApp.swift b/sample_apps/PoolFormerDemo/PoolFormerDemo/PoolFormerDemoApp.swift
new file mode 100644
index 0000000..78bd360
--- /dev/null
+++ b/sample_apps/PoolFormerDemo/PoolFormerDemo/PoolFormerDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct PoolFormerDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}

From 4ee4dbae24cc4e019226c163d5972a38cc264af7 Mon Sep 17 00:00:00 2001
From: john-rocky
Date: Sun, 29 Mar 2026 00:33:45 +0900
Subject: [PATCH 02/18] Add missing AppIcon.appiconset to all creative apps

---
 creative_apps/.DS_Store                  | Bin 0 -> 6148 bytes
 creative_apps/CDTNetDemo/.DS_Store       | Bin 0 -> 6148 bytes
 .../CDTNetDemo.xcodeproj/project.pbxproj |  37 +++++++++---------
 .../contents.xcworkspacedata             |   7 ++++
 .../UserInterfaceState.xcuserstate       | Bin 0 -> 14194 bytes
 .../xcschemes/xcschememanagement.plist   |  14 +++++++
 .../AppIcon.appiconset/Contents.json     |  13 ++++++
 .../AppIcon.appiconset/Contents.json     |  13 ++++++
 .../AppIcon.appiconset/Contents.json     |  13 ++++++
 .../AppIcon.appiconset/Contents.json     |  13 ++++++
 .../AppIcon.appiconset/Contents.json     |  13 ++++++
 .../AppIcon.appiconset/Contents.json     |  13 ++++++
 .../AppIcon.appiconset/Contents.json     |  13 ++++++
 .../AppIcon.appiconset/Contents.json     |  13 ++++++
 .../AppIcon.appiconset/Contents.json     |  13 ++++++
 .../AppIcon.appiconset/Contents.json     |  13 ++++++
 16 files changed, 170 insertions(+), 18 deletions(-)
 create mode 100644 creative_apps/.DS_Store
 create mode 100644 creative_apps/CDTNetDemo/.DS_Store
 create mode 100644 creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 create mode 100644 creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/xcuserdata/majimadaisuke.xcuserdatad/UserInterfaceState.xcuserstate
 create mode 100644 creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/xcuserdata/majimadaisuke.xcuserdatad/xcschemes/xcschememanagement.plist
 create mode 100644 creative_apps/CDTNetDemo/CDTNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
 create mode 100644 creative_apps/DemucsDemo/DemucsDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
 create mode 100644 creative_apps/FOMMDemo/FOMMDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
 create mode 100644 creative_apps/Face3DDemo/Face3DDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
 create mode 100644 creative_apps/LivePortraitDemo/LivePortraitDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
 create mode 100644 creative_apps/MotionMagDemo/MotionMagDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
 create mode 100644 creative_apps/NAFNetDemo/NAFNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
 create mode 100644 creative_apps/RelightDemo/RelightDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
 create mode 100644 creative_apps/SimSwapDemo/SimSwapDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
 create mode 100644 creative_apps/Wav2LipDemo/Wav2LipDemo/Assets.xcassets/AppIcon.appiconset/Contents.json

diff --git a/creative_apps/.DS_Store b/creative_apps/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..7ce5517cd73644cdfac19847b3583da131209ee5
GIT binary patch
literal 6148
zcmeHK%}N6?5T3Nv?kYkL3VI88Em*A<#mmyxs|Q!~pi;YZ(Z%hibhq|U3VYWV@V&8lV#w`vl=RfNsYXdJ)w#ZLIIiP%
zDm#{WCoal?=e8_N&9FlC|Lp7c|9ldUm;q+sPca}$ZNJ^bCE41#usEu<67?RHgyM38pDEbU
hr5IzW6mO$yLA#_5qGvHTh!zyS2xuC(VFrGbfe*L$Orih)

literal 0
HcmV?d00001

diff --git a/creative_apps/CDTNetDemo/.DS_Store b/creative_apps/CDTNetDemo/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..a8620a297b6cdd674588d42a8a5d43ab6498a8d5
GIT binary patch
literal 6148
zcmeHKOG*SW5UtW#G`N|Ch`7oDM3@`IF>c+AYmu2A6q%+QQE-_fcmxmN5yY+6@>Qz*
zH1=i?M6?Q0ukv~6PL`!O@9hm4h
zHp$wz=?JSnr4y|DWBmtoM+@wuMr%jch^|%_li8weI^D?G<2u}?O=-Sr=ipS&`wx3N
zPcMh-@#}B#yUWjOzt=6ecLiJlSHKncj|$+<7OS=t-Ma#=fGhB>fSeBjMKBs>#d37O
zl@tId&*)-^C6^FRFpP#-5j`+gDp09xDF!PY?ZL)H!>p)uVoN^QzWkQFaQQms4`n!U
zRCMnOxB`6zHnlpI`~Mif%wUq=r+Cd3a0ULD0z68yG{vUuZvC=7xoZQ;If|H!%M!p4
l_Z|VL$T_lGI<-GYjB(L0E6Oa=Ug$u72oyowxdK0+z$+6bKkonl

literal 0
HcmV?d00001

diff --git a/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.pbxproj b/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.pbxproj
index 1ff237f..a7d2ac4 100644
--- a/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.pbxproj
+++ b/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.pbxproj
@@ -7,17 +7,17 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
-		A10001 /* CDTNetDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10002; };
-		A10003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10004; };
-		A10005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = A10006; };
-		A1CD02 /* CDTNet_Harmonization.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = A1CD01; };
+		A10001 /* CDTNetDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10002 /* CDTNetDemoApp.swift */; };
+		A10003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A10004 /* ContentView.swift */; };
+		A10005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = A10006 /* Assets.xcassets */; };
+		A1CD02 /* CDTNet_Harmonization.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = A1CD01 /* CDTNet_Harmonization.mlpackage */; };
 /* End PBXBuildFile section */
 
 /* Begin PBXFileReference section */
-		A10007 /* CDTNetDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = CDTNetDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		A10002 /* CDTNetDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CDTNetDemoApp.swift; sourceTree = "<group>"; };
 		A10004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
 		A10006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		A10007 /* CDTNetDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = CDTNetDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		A10008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
 		A1CD01 /* CDTNet_Harmonization.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = CDTNet_Harmonization.mlpackage; sourceTree = "<group>"; };
 /* End PBXFileReference section */
@@ -66,7 +66,7 @@
 /* Begin PBXNativeTarget section */
 	A10013 /* CDTNetDemo */ = {
 		isa = PBXNativeTarget;
-		buildConfigurationList = A10014;
+		buildConfigurationList = A10014 /* Build configuration list for PBXNativeTarget "CDTNetDemo" */;
 		buildPhases = (
 			A10015 /* Sources */,
 			A10009 /* Frameworks */,
@@ -78,7 +78,7 @@
 		);
 		name = CDTNetDemo;
 		productName = CDTNetDemo;
-		productReference = A10007;
+		productReference = A10007 /* CDTNetDemo.app */;
 		productType = "com.apple.product-type.application";
 	};
 /* End PBXNativeTarget section */
@@ -96,7 +96,7 @@
 			};
 		};
 	};
-	buildConfigurationList = A10018;
+	buildConfigurationList = A10018 /* Build configuration list for PBXProject "CDTNetDemo" */;
 	compatibilityVersion = "Xcode 14.0";
 	developmentRegion = en;
 	hasScannedForEncodings = 0;
@@ -105,11 +105,11 @@
 		Base,
 	);
 	mainGroup = A10010;
-	productRefGroup = A10012;
+	productRefGroup = A10012 /* Products */;
 	projectDirPath = "";
 	projectRoot = "";
 	targets = (
-		A10013,
+		A10013 /* CDTNetDemo */,
 	);
 };
 /* End PBXProject section */
@@ -199,6 +199,7 @@
 	ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
 	CODE_SIGN_STYLE = Automatic;
 	CURRENT_PROJECT_VERSION = 1;
+	DEVELOPMENT_TEAM = MFN25KNUGJ;
 	GENERATE_INFOPLIST_FILE = YES;
 	INFOPLIST_FILE = CDTNetDemo/Info.plist;
 	INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
@@ -226,6 +227,7 @@
 	ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
 	CODE_SIGN_STYLE = Automatic;
 	CURRENT_PROJECT_VERSION = 1;
+	DEVELOPMENT_TEAM = MFN25KNUGJ;
 	GENERATE_INFOPLIST_FILE = YES;
 	INFOPLIST_FILE = CDTNetDemo/Info.plist;
 	INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
@@ -249,26 +251,25 @@
 /* End XCBuildConfiguration section */
 
 /* Begin XCConfigurationList section */
-	A10018 /* Build configuration list for PBXProject */ = {
+	A10014 /* Build configuration list for PBXNativeTarget "CDTNetDemo" */ = {
 		isa = XCConfigurationList;
 		buildConfigurations = (
-			A10019,
-			A10020,
+			A10021 /* Debug */,
+			A10022 /* Release */,
 		);
 		defaultConfigurationIsVisible = 0;
 		defaultConfigurationName = Release;
 	};
-	A10014 /* Build configuration list for PBXNativeTarget */ = {
+	A10018 /* Build configuration list for PBXProject "CDTNetDemo" */ = {
 		isa = XCConfigurationList;
 		buildConfigurations = (
-			A10021,
-			A10022,
+			A10019 /* Debug */,
+			A10020 /* Release */,
 		);
 		defaultConfigurationIsVisible = 0;
 		defaultConfigurationName = Release;
 	};
 /* End XCConfigurationList section */
-
 	};
-	rootObject = A10017;
+	rootObject = A10017 /* Project object */;
 }
diff --git a/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
new file mode 100644
index 0000000..919434a
--- /dev/null
+++ b/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+   <FileRef
+      location = "self:">
+   </FileRef>
+</Workspace>
diff --git a/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/xcuserdata/majimadaisuke.xcuserdatad/UserInterfaceState.xcuserstate b/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/xcuserdata/majimadaisuke.xcuserdatad/UserInterfaceState.xcuserstate
new file mode 100644
index 0000000000000000000000000000000000000000..2202d2e2f8e5b4712b5c2b9203f8b2df9ee45f9d
GIT binary patch
literal 14194
zcmch72Yi#&`u`bilQg?P+NQLGrcKkdN%ullCzR0^(iSL`kTx$RkR~-rfht~3aN%AT
z7oxO91Qh`%F2w5|*8zy4BI-|^R~#s!fc~EMeVepJ@ZQgVT>2r&d)~9pGrrIFIc;4Y
zcOaOXyB}dh5Jgca8pR+9ik+Lbobw0VKJUCVr@wuP8=e}|g1)Y~X}-3lTzfDOhwx<^
z9U66=eLB~}xylDBy^W+u)?^PlgB-)#wYV6`kpjh|1eAzMP$?=y=sERF4`^GipKe(L%HYxzSRz44scIMVFz=(OPr`x)NQ5u143O8_?XrT8*@IbMsez*pj{@C|r9z5{Q-cjCM7Mtm>6
z58sb>;D_<^_yznTehI&fU%{{9f8sar+xQ6n9Djko#9!gB@i+Ki_*;CGAc6@Ylthte
z5nM>xA1*Da5WHIR=OGzj3
zlFP{DWG%UZTuH7XSCebVwd6W-J^4GimE1;dC!5JVWE**iJWL)TkCUg#v*aJ-Me-7P
zjl4uCd>MQ77yI)~1u3+N)cn0C-5)I&Sz`7}rw4be69VtNU^p58!jq&LyO(=Bu>
zy_eoc@2A`719UsxK_8`$(WmH(^d!|u(eLRG^dvpil;-XB
zc=jPBQXw_cpb%aMwEuqkqKq6XcogHOv+>|mdTldDVd6?nTBb%
zp-f~!StuJ>kq!RmqCAw33Q!>`VsR{y&0q~|7Ml(K?W~z`tef4)e9Y$1B+Ta8VWQP8
zchKh#PrGUMjDSOT;k`Pt|?BBhx0>!dhva^-|zHxz(l;kDz2@&qt?6F=g??jPL)1y
zkXsoPmk8aA6sJ9Wwhm413}+x%>FezBa6!)1z?-i?5PB1rSK$nRmEzY6ib7s11h9wO5d*YBaSE)u1VC1k?oOx6=?-)+<8rEeE4&_` z(-p|6^!d4&GpsXxF3uCkX@^GTc#Ssn;*!GLXLI|wuU*o$VN z*~pF>TSWQxIyvad+PS5ZT9GVibjqq8}022lQv3Gaz0(0hI!|H@r z;6Ywg+>2`1h+gDlDa;mr;YSyu;w>nEf~XrUM=Q`u)PpWyW0--BWvR@_(pWk(Z9%Kh zMd)v6HDV}))}V`72D7j%md&gzhZV3wehcCQ19Dp%@&NAyyn}v zU_ZJEtwT*9knkWjx+T)cp-HZ-sI440zxsjx2Nz(R?Ae3(f8jmdfbK-a8#l6n!Fo|2 zSeeLhRc_FCpV#T{=|{b&A2q>D2Ft>I+>P$x4{S5b?L}Ky9>2e8Ze^F#3sy|n-zv~8 z-l5!&wrynj;Ix&}NF*poJo)4MEEkW*7tP+DDC zR${HJsmTKeRaI#%t*k1u7L-=il$2LhRhO1lhev-F{bTraUO+GL)0x6_z362&Zct`l zLvNwtP3U#>PxJ(FGJrMg2bv%5RIAc>f zSbZxP|2pt*_MjgGDXo}8Ofd?~y7+k|qPDPK{P-tg5Tf5=)`2oU9w&EOEGPbe#y*QUG zV78(Ah6`c!MYxzpGQ1oMqLiRo{|RdqccK)RHG}m8IMjzrBmNqnhbQozj%SWuJdrIL z-YIlhfh%#jpU!mF-iv24mjD<3t9|2)^IyAbW#kGMs zZh)2?Z#y;O3wzb{W~blnGRi3r1va~d>vb?CYy1J%G zIMoH=KJtrgc`fb+{>1Z2v-84R-Gl!IE(c$LFT|_xMQk}+!B(=KO?WkCIE2@*3)qEh z6}yN(S{*3Ba7f?i4)PnC@TV8eBAaoxSM^;yNPun4{Zp%fXaZi_yE{7pM~Kp9=Q{W> zCD0@ULCtw36?wV2ny*C@ay3rs$JgL%@pbHP>=Jeb3u9Y+Bfc51Exrl=9j{}n8DpV- zd<(u6uV!o5#jq||5wFt2S2S$I2EW_q7lN>?zphx}Y+q*Qx}1I|#Dfk^=3mzq#}IIf zak7A~5SnEOA4o60`%lx}jPJo)*rn_;b~&>dCw(tAYDb22f((1{HqiVB@OIvqxB!dS zYonxXVWtTa8WkNQ`9Eivq_Rf8&ld!GNs}m7j2JmOIn$Cm?!58F3FWoZrq7sZpEGxX zV}J@!s?-{79E3arOBCZFfPv*fup@2VK`sCkKmw!9HGq^k*S=9YReWL+Kk+kIgWd!d z)vJfleFT`n8i3RXQpOm@rmBo->81=bl-b=Ea4^u~8jrIhzynP#(C&A4!TtmHFx3g_ z3osk%@qWuV*(C0mcCCg${>(Q+R;P<&_>w3lSJP$=>a{lz7v0{1YAjSxUHKOnz|NMnb_P-m}f zy zEzlV-Y7+Dw?wJq7eg45<1{NH7+t|~^4ZhJJP0Zf9`i5C&i*$!U{Jw_Sk#HAUG&Y^- z%p03$!&wWC9h{7~R({ikr9g{$EnscW+}*zUv(HirwObbs$VOl5Y}!MOXDJ8fREzXL5@iDCQv!_W42bCM$OqBlD&Rq{Mz;ayxdGjUHluC8b?yNW|2%pT zy$o#U+vp=erAN_890P2oj)yV8W~Kl-DFX(x1-MDDs$f4a2Tt-Dd>!zT>%j8d2JGYp z{185Xf56A_3H%#A1zckck&;-VBx<4szA=wXCo@SsX(SG^gmjaYBm_+2I^Ya@fi2tx z{NNkp6Y@E5gWr>%X)INWa!mlj3IEy$z6ZW&DqAbM^_}>^)(El!SmEr)JMlxP33iZw z3LBqC@KdOGGuX_>@GhA26ZlE6oL91|ScqNCuGx&A#=G$|cn^H;X4kUo*!AoN_`Hs{ zrKx{y^mv*2x0PTjP$}cDd*UnEgFU?C`8PEi0a*z(jgzI~j%e^+0d~AL06UT_v1-=Z z+URpRdkoH?A$MFsf#_G>#7P?meaT+@0gwrJAKs7OfdhOOzlRUv_u1dsI(9R=h26?- zW4E*QTL6e00zh&Yz7L=xWCa*<2f&XF@O=w>&*JxK6s%6tp?F(>-tqzk-6D z#Ha9S0TJHKo?=h4f3WA^7~l{zFhlzi--tQKi0_#LsD81xBs@_=3lj~VH4K$dgLvb+9`ftcbE7=1YgETM|Hj+z~BH+ z?ukDm?ZWd*Aj$mv5{aH9k2H_%ai{`ljKnOE!70C&a#|kV2Kmt5qgD(K`Y=Py(;Y|{y zCT_1(7OQCNat4>o1G=7DITaZ2Uipv^QG@nkm0Gb-pvL&jpfG~aNrFz12ko_rKDit! 
zjZlcC3ATN`qo7=;=#}>+B=sfgjbU$K++z@b_Ki+9PWJB+9*mRMiiJ{bnr-~_#(4|d zJAD_d3SEEOoxPj4?|gLEle=GfYu|@r9US&BDdKbF^N8SINGT};6H>jh%OhH*`tG2| z?S(`Jw?c@9mh&4uaG?EU9666?E&jv$4o#b*F*v2NIjGmTUtgWrjTY`~wd` z2)Yb>+Qnrsqym<&g$AiM4eHa5?28+57TF)A`?Kr|C@>AOf8!y!)QFnU9H5(=Xc^$Z z3jhQDfiaH4608A?HyWplQXfHE@ewT$eqft%a@SJZKjVQUC@F5W$beb1olvqK;ntmREy z0~kAp#!|rn-NNyqK9DH^pI7h(K)v|~(1zGl^bv-E2gs+o`B;tb3xaZ<8kphi@pT7V zhSoNAiLb(g3->5o!woy&ukdTmDfB0p1K8ye;7SH^w?aZx^zDX$qQU&;@cse|XF6B9 zJ0XsPa05;T{IAfu1Dqf~TiyWfBR7y6Aw+)#E67byq0S!Jj~Se|gWnYhdLm^4)MIao zc6A-O8M2&_cYWj*)HJjJLK|@>95cAZ!A9R~cgK>T1|1;l$sJ?^__{mEU1TFf4}GMc zY(m*E)R{c+!a*x29?_@>nwSlNKX_wKEQgkzps$0~9f0)sldWGI?sM=R`;hHthuNoW zA8T-Eav~;C^w!`E_*{?J;UId67*SD(x1g#0WGlIs+{fNw2ig0;CzA)rc39O8vXea6 zGB{e3Fb{Tsz02PFvl-G5aWfw!k2y4>;hYCuk{BZn?62G3M|Sa!=`ZUH%mEz(-FOh# z{RZ}4cv`!Kq7V3@A$D<}kia-Xo+Hnr8SD`Ih@t;-o|?Q&UO9UbppU%DFZgWv0Ujy6 zPX0;WXyIXsxKZpA_OV0L^8Y%yenz^QyiNAb2Ui~!y?(Nf><76YQUr(G0QMg)9>CnH zA>-O39`OP4E>PYPR5&1KgXcY5d;7?H{ObSX`l49A4_OQH0sAZx9+Hp95eS>fVe&B; z)lbR)kk8l=_Bj~SFW8sttIfztz93)1-`7G&_%-_m{{AI~gUJwvbbDNUFwy|g;B4-E zfCv1&3W4&R&uhtax7P(Qx^>XjoB*)1w;P!$$A662ni@}JnE#CBh&4N#Vcv$=vc;eN{iNE?t9 zT0$oy-4NeD|rI5vdktbX|AiE~}C0Dtvgzg9u*a6^JNhH#X4uL3fabnal$zq>UN z-m_M^a9CcIIw@SX>7{KUEIa4bhyUn;19j7-V%KnsthlJMrYgUz2(BJf6k7}Ps;eOj zR9#`MEGew4E-5Q4E~&^BY$NqjxZ%a$xDzeG5S{L$UA!GU>pM{z{Q|ZOFRhybjNC+* z(-m|j?FnH`2x~()E`&#Hq8HLtcsKo92#*ZmM0Q;Wr!w1I9-Qz{Xr6#i_+Kp|Y!PuD z&xu9gQUra4N4u0>$&Yp!y_~M4SA_7W5Y~lod?oL~bWZ$kKtD z1L?%A;0**bILkR0bI4Oijhj4NTr-bmB5H^JHSRf3AKcEW5KUfF<$5Vyk zADCc1_Y?s}F5vY7J`cyE(;4nIKOkZ8mR}ft^1R_$Yx86%*u>7R;q@)&s^3ss)d=r| zD}>?Bv(Hs|CR}}TL0mVyI`3D{J{Q=r;Z@H$ZF_fHKuFOHub*?SuHgz9PtEYAdU%U- z&OH)P4OwdVICoOGNz_Xp3}It0eK>^Ec%P^R91@XM!6k}1vI{5m(Z}f%A)FqWTn4`a3Px&!bLpWr-m-V(G{-%#>w-HuNs~1l)(-P501$(C6V^^e6Q9D0x(VR7;d6YE9H#Q4dAE8g)8a9c_xv zi7t#TjxLQJ7d<7qK6-w1YqTTU8U0%H{^-NeUq&B|J{El{MiwKFQO2la0x_#%E{VA! 
z=H{3UF?YrE#`MSB8*_im12H>d9*j98(MqOE=1S&ES|y7lZ4#H{e90QgC6db|Yb94o
zu9jRYxn6Rkq+fEk3Hchsa@JEohxmT
zE|79kzqCiXTDn$xmGm0vb<*w9UD6k%FG^pMz9QW#{Zjg!^ps2{Q^-^@t!#vBlq^G*
zE6bA=$i~acWe%A~)+zJKx?~s0m~4&gQrYFQD`YpzZk63GyF+$Vc2agqc3SpFtRhw&
ztBoBQtBXyDO^>z3=Emm77RDCGPKqs$t&FXXof2CcyE3*f_U+i;w+ZF2-cPQ>u+^2Xzu|x5Y;t|DTiWd}bDGn+=QhcoVMDexaTg7*Z9~8eU
zv63odlu~7^a)eT+Oi=2TMrFRTOj)CxsjOGdQreX*$_2`W%0RcTd;s#H~)%Bm_MC`Oda8PwdWO1A-JqVW=F}Z(w|benQ|(iq
zuMViY)hpCJ>Pyt?)eoy*Qh%gArarA1rLk#pG^LtOO+dpmS7~n5^lJJwcWds^Y}Gud
zd06wPW|!s(%~P6pG~a9G+KJjK?R4!-ZL_vj>(Dy2?OIOj*Y;>H)Lx`rt-VaUR(qxP
zYVEb!$FxV{v~l{l(Q%n^xpDb%g>l7k6XGVtmB&@aRmauEHN`EATdeczg1U=zSLtrl
z-KN{Ddr0?)?lIltx+ir{>z>iQu6slGmTs?ZzwUtUJ>C1dKjPEk>*80&hvF}ezdC+h
z{4Mc!$3GCiJ$`5W?)W$3-;UoG|4#h7@dx8Sh(8p6IR5+iALDu1*Xk
zUYvMo;^m3ACvHo8HgSLAmx(9!Dt(eZO>feh^%i}$zDQr9FVml=pP-+lZ__W=uhl=G
ze^URv{zd)E`d9U@>)+A8t3Rm!Kz~SoSbsbzHp!SYF==|zjHH=K^+`*URwP}Jv?^(J
zQYh)-q^py*Cf%2`ZS?-phmvEGmC41)>yx)6Z%f{uyfgXH6BtMn>S&A%0mog@0
zY>F`@JtZS0Hzhx%Fr_%9G-X`Ml$2>HGg9hPW~JCu7NjgpSv2P3F<*^2KIY_@(_{WH
zL>XcXQbVkv&@dMW(Kdt2&}r}*x(q#r%M9xc8w__DdJX-CyAAgkwi@m;Y%^>(Ja2f>
z@Ur1m!|R4O3~w3s84ef@8a_09G!~Dwjh!`i!PqrpZy&p3?7p#|jy;}AQ{z)DsoAMT
zspY9Psk2i%Qr)R*QrD$!N_{Hz&D3vFzfJuv^@r4BsXwRwV$3w=8_SI687CMg8Ox1z
z#(74E(P?Zqa>f;`he&Ye-d&VQiFN|LqzcC(9qiNA;k~CRbAZ=CJ
zjcK=}ZA*JJ?b)>F(q2eUP0va%NuQWLEqy^cmwtWvy7WEiucW`8{zm#+>4(z4PXAZ>
z(e&@re@y=={kVymqD>N$%p^A{O=^?YlxLb?nr?EK&Np3by3}-|={D1P(+1OBrp=};
zru$3}n0A<6FdZ`eWICOJGDt>LMrKBJhCAcJjN3A{XFQkjX2!P}-(~!eaV+EKj1w6r
zGfrpxVa8^?d9*pjY%r&q)66Dwj=9)eY941EZ=PtLZk}nbH_tLJG%qr@nO)}d%>i?_
zd4+k6`8x9r=9|px%sb7y&2O6DHt#dPV}93s(EOqKu=!K-G4pZrujW(c-!pM$RHh^|
zHdB$AkeQ!Zow+E}n;FWyDYG~8-pp;8J2D^4d^q#z%srXUWxkMkB=g72W0^l?9=A|S
zj74TqSX35`CDk(7GSgCLskh9sth9tIw^}w>9=1Gf*=^Zl`KRT8<)Gz5%VEo>mTxRa
zEk9U}S&myyWZ^7%mM+VdbzW9g)|9MiSu?X5vgT#Y&uYzbWI40ivqD*SW<8tre%2RR
zC$dgv{hp1pqp~I0vDwOOP4?JqV|IFWMs{X4+f$>nGOFtzTLHWj$&=X(P61o75(^#o0#L;%$1{Xq&}WVw+~O
z+ZNe6Y)fsOwl3QW+Xc2&w$-+f?MmA%wp(qt+1A@O*zUD$v+b}wWP8-M%l548McXU3
z*KKduKCpdiJ7W9N_KodF+t0RNY`@t~<;3Kub4KSFb4)pzIoUY{ImJ1pIp^g}$f+0o
RKOy2bJw1R22EKCU{x6gd?27;Z

literal 0
HcmV?d00001

diff --git a/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/xcuserdata/majimadaisuke.xcuserdatad/xcschemes/xcschememanagement.plist b/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/xcuserdata/majimadaisuke.xcuserdatad/xcschemes/xcschememanagement.plist
new file mode 100644
index 0000000..d4334ae
--- /dev/null
+++ b/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/xcuserdata/majimadaisuke.xcuserdatad/xcschemes/xcschememanagement.plist
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>SchemeUserState</key>
+	<dict>
+		<key>CDTNetDemo.xcscheme_^#shared#^_</key>
+		<dict>
+			<key>orderHint</key>
+			<integer>0</integer>
+		</dict>
+	</dict>
+</dict>
+</plist>
diff --git a/creative_apps/CDTNetDemo/CDTNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/CDTNetDemo/CDTNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
new file mode 100644
index 0000000..13613e3
--- /dev/null
+++ b/creative_apps/CDTNetDemo/CDTNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
@@ -0,0 +1,13 @@
+{
+  "images" : [
+    {
+      "idiom" : "universal",
+      "platform" : "ios",
+      "size" : "1024x1024"
+    }
+  ],
+  "info" : {
+    "author" : "xcode",
+    "version" : 1
+  }
+}
diff --git a/creative_apps/DemucsDemo/DemucsDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/DemucsDemo/DemucsDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
new file mode 100644
index 0000000..13613e3
--- /dev/null
+++ b/creative_apps/DemucsDemo/DemucsDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
@@ -0,0 +1,13 @@
+{
+  "images" : [
+    {
+      "idiom" : "universal",
+      "platform" : "ios",
+
"size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/FOMMDemo/FOMMDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/FOMMDemo/FOMMDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/FOMMDemo/FOMMDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/Face3DDemo/Face3DDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/Face3DDemo/Face3DDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/Face3DDemo/Face3DDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/LivePortraitDemo/LivePortraitDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/LivePortraitDemo/LivePortraitDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/LivePortraitDemo/LivePortraitDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/MotionMagDemo/MotionMagDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/MotionMagDemo/MotionMagDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/MotionMagDemo/MotionMagDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/NAFNetDemo/NAFNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/NAFNetDemo/NAFNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/NAFNetDemo/NAFNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/RelightDemo/RelightDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/RelightDemo/RelightDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/RelightDemo/RelightDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/SimSwapDemo/SimSwapDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/SimSwapDemo/SimSwapDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/SimSwapDemo/SimSwapDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ 
+{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/Wav2LipDemo/Wav2LipDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/Wav2LipDemo/Wav2LipDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/Wav2LipDemo/Wav2LipDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} From 9b0df17e945e793d8e0c04e8c700c443e70dd696 Mon Sep 17 00:00:00 2001 From: john-rocky Date: Sun, 29 Mar 2026 00:34:05 +0900 Subject: [PATCH 03/18] Add .DS_Store and xcuserdata to gitignore, remove tracked copies --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 4e76ca2..c4c6fc5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +# macOS / Xcode +.DS_Store +xcuserdata/ +*.xcworkspace/ + # CoreML model files (download from Google Drive) *.mlpackage *.mlmodel From d7bf6f0501000d2a5c39b84e233ce63843db6bfa Mon Sep 17 00:00:00 2001 From: john-rocky Date: Sun, 29 Mar 2026 00:34:12 +0900 Subject: [PATCH 04/18] Remove tracked .DS_Store and Xcode user files --- creative_apps/.DS_Store | Bin 6148 -> 0 bytes creative_apps/CDTNetDemo/.DS_Store | Bin 6148 -> 0 bytes .../contents.xcworkspacedata | 7 ------- .../UserInterfaceState.xcuserstate | Bin 14194 -> 0 bytes .../xcschemes/xcschememanagement.plist | 14 -------------- 5 files changed, 21 deletions(-) delete mode 100644 creative_apps/.DS_Store delete mode 100644 creative_apps/CDTNetDemo/.DS_Store delete mode 100644 creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata delete mode 100644 creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/xcuserdata/majimadaisuke.xcuserdatad/UserInterfaceState.xcuserstate delete mode 100644 creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/xcuserdata/majimadaisuke.xcuserdatad/xcschemes/xcschememanagement.plist diff --git a/creative_apps/.DS_Store b/creative_apps/.DS_Store deleted file mode 100644 index 7ce5517cd73644cdfac19847b3583da131209ee5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}N6?5T3Nv?kYkL3VI88Em*A<#mmyxs|Q!~pi;YZ(Z%hibhq|U3VYWV@V&8lV#w`vl=RfNsYXdJ)w#ZLIIiP% zDm#{WCoal?=e8_N&9FlC|Lp7c|9ldUm;q+sPca}$ZNJ^bCE41#usEu<67?RHgyM38pDEbU hr5IzW6mO$yLA#_5qGvHTh!zyS2xuC(VFrGbfe*L$Orih) diff --git a/creative_apps/CDTNetDemo/.DS_Store b/creative_apps/CDTNetDemo/.DS_Store deleted file mode 100644 index a8620a297b6cdd674588d42a8a5d43ab6498a8d5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKOG*SW5UtW#G`N|Ch`7oDM3@`IF>c+AYmu2A6q%+QQE-_fcmxmN5yY+6@>Qz* zH1=i?M6?Q0ukv~6PL`!O@9hm4h zHp$wz=?JSnr4y|DWBmtoM+@wuMr%jch^|%_li8weI^D?G<2u}?O=-Sr=ipS&`wx3N zPcMh-@#}B#yUWjOzt=6ecLiJlSHKncj|$+<7OS=t-Ma#=fGhB>fSeBjMKBs>#d37O zl@tId&*)-^C6^FRFpP#-5j`+gDp09xDF!PY?ZL)H!>p)uVoN^QzWkQFaQQms4`n!U zRCMnOxB`6zHnlpI`~Mif%wUq=r+Cd3a0ULD0z68yG{vUuZvC=7xoZQ;If|H!%M!p4 l_Z|VL$T_lGI<-GYjB(L0E6Oa=Ug$u72oyowxdK0+z$+6bKkonl diff --git a/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata deleted file mode 100644 index 919434a..0000000 --- 
a/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+++ /dev/null
@@ -1,7 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<Workspace
-   version = "1.0">
-   <FileRef
-      location = "self:">
-   </FileRef>
-</Workspace>
diff --git a/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/xcuserdata/majimadaisuke.xcuserdatad/UserInterfaceState.xcuserstate b/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/project.xcworkspace/xcuserdata/majimadaisuke.xcuserdatad/UserInterfaceState.xcuserstate
deleted file mode 100644
index 2202d2e2f8e5b4712b5c2b9203f8b2df9ee45f9d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 14194
zcmch2Yi#&`u`bilQg?P+NQLGrcKkdN%ullCzR0^(iSL`kTx$RkR~-rfht~3aN%AT
z7oxO91Qh`%F2w5|*8zy4BI-|^R~#s!fc~EMeVepJ@ZQgVT>2r&d)~9pGrrIFIc;4Y
zcOaOXyB}dh5Jgca8pR+9ik+Lbobw0VKJUCVr@wuP8=e}|g1)Y~X}-3lTzfDOhwx<^
z9U66=eLB~}xylDBy^W+u)?^PlgB-)#wYV6`kpjh|1eAzMP$?=y=sERF4`^GipKe(L%HYxzSRz44scIMVFz=(OPr`x)NQ5u143O8_?XrT8*@IbMsez*pj{@C|r9z5{Q-cjCM7Mtm>6
z58sb>;D_<^_yznTehI&fU%{{9f8sar+xQ6n9Djko#9!gB@i+Ki_*;CGAc6@Ylthte
z5nM>xA1*Da5WHIR=OGzj3
zlFP{DWG%UZTuH7XSCebVwd6W-J^4GimE1;dC!5JVWE**iJWL)TkCUg#v*aJ-Me-7P
zjl4uCd>MQ77yI)~1u3+N)cn0C-5)I&Sz`7}rw4be69VtNU^p58!jq&LyO(=Bu>
zy_eoc@2A`719UsxK_8`$(WmH(^d!|u(eLRG^dvpil;-XB
zc=jPBQXw_cpb%aMwEuqkqKq6XcogHOv+>|mdTldDVd6?nTBb%
zp-f~!StuJ>kq!RmqCAw33Q!>`VsR{y&0q~|7Ml(K?W~z`tef4)e9Y$1B+Ta8VWQP8
zchKh#PrGUMjDSOT;k`Pt|?BBhx0>!dhva^-|zHxz(l;kDz2@&qt?6F=g??jPL)1y
zkXsoPmk8aA6sJ9Wwhm413}+x%>FezBa6!)1z?-i?5PB1rSK$nRmEzY6ib7s11h9wO5d*YBaSE)u1VC1k?oOx6=?-)+<8rEeE4&_`
z(-p|6^!d4&GpsXxF3uCkX@^GTc#Ssn;*!GLXLI|wuU*o$VN
z*~pF>TSWQxIyvad+PS5ZT9GVibjqq8}022lQv3Gaz0(0hI!|H@r
z;6Ywg+>2`1h+gDlDa;mr;YSyu;w>nEf~XrUM=Q`u)PpWyW0--BWvR@_(pWk(Z9%Kh
zMd)v6HDV}))}V`72D7j%md&gzhZV3wehcCQ19Dp%@&NAyyn}v
zU_ZJEtwT*9knkWjx+T)cp-HZ-sI440zxsjx2Nz(R?Ae3(f8jmdfbK-a8#l6n!Fo|2
zSeeLhRc_FCpV#T{=|{b&A2q>D2Ft>I+>P$x4{S5b?L}Ky9>2e8Ze^F#3sy|n-zv~8
z-l5!&wrynj;Ix&}NF*poJo)4MEEkW*7tP+DDC
zR${HJsmTKeRaI#%t*k1u7L-=il$2LhRhO1lhev-F{bTraUO+GL)0x6_z362&Zct`l
zLvNwtP3U#>PxJ(FGJrMg2bv%5RIAc>f
zSbZxP|2pt*_MjgGDXo}8Ofd?~y7+k|qPDPK{P-tg5Tf5=)`2oU9w&EOEGPbe#y*QUG
zV78(Ah6`c!MYxzpGQ1oMqLiRo{|RdqccK)RHG}m8IMjzrBmNqnhbQozj%SWuJdrIL
z-YIlhfh%#jpU!mF-iv24mjD<3t9|2)^IyAbW#kGMs
zZh)2?Z#y;O3wzb{W~blnGRi3r1va~d>vb?CYy1J%G
zIMoH=KJtrgc`fb+{>1Z2v-84R-Gl!IE(c$LFT|_xMQk}+!B(=KO?WkCIE2@*3)qEh
z6}yN(S{*3Ba7f?i4)PnC@TV8eBAaoxSM^;yNPun4{Zp%fXaZi_yE{7pM~Kp9=Q{W>
zCD0@ULCtw36?wV2ny*C@ay3rs$JgL%@pbHP>=Jeb3u9Y+Bfc51Exrl=9j{}n8DpV-
zd<(u6uV!o5#jq||5wFt2S2S$I2EW_q7lN>?zphx}Y+q*Qx}1I|#Dfk^=3mzq#}IIf
zak7A~5SnEOA4o60`%lx}jPJo)*rn_;b~&>dCw(tAYDb22f((1{HqiVB@OIvqxB!dS
zYonxXVWtTa8WkNQ`9Eivq_Rf8&ld!GNs}m7j2JmOIn$Cm?!58F3FWoZrq7sZpEGxX
zV}J@!s?-{79E3arOBCZFfPv*fup@2VK`sCkKmw!9HGq^k*S=9YReWL+Kk+kIgWd!d
z)vJfleFT`n8i3RXQpOm@rmBo->81=bl-b=Ea4^u~8jrIhzynP#(C&A4!TtmHFx3g_
z3osk%@qWuV*(C0mcCCg${>(Q+R;P<&_>w3lSJP$=>a{lz7v0{1YAjSxUHKOnz|NMnb_P-m}f
zy
zEzlV-Y7+Dw?wJq7eg45<1{NH7+t|~^4ZhJJP0Zf9`i5C&i*$!U{Jw_Sk#HAUG&Y^-
z%p03$!&wWC9h{7~R({ikr9g{$EnscW+}*zUv(HirwObbs$VOl5Y}!MOXDJ8fREzXL5@iDCQv!_W42bCM$OqBlD&Rq{Mz;ayxdGjUHluC8b?yNW|2%pT
zy$o#U+vp=erAN_890P2oj)yV8W~Kl-DFX(x1-MDDs$f4a2Tt-Dd>!zT>%j8d2JGYp
z{185Xf56A_3H%#A1zckck&;-VBx<4szA=wXCo@SsX(SG^gmjaYBm_+2I^Ya@fi2tx
z{NNkp6Y@E5gWr>%X)INWa!mlj3IEy$z6ZW&DqAbM^_}>^)(El!SmEr)JMlxP33iZw
z3LBqC@KdOGGuX_>@GhA26ZlE6oL91|ScqNCuGx&A#=G$|cn^H;X4kUo*!AoN_`Hs{
zrKx{y^mv*2x0PTjP$}cDd*UnEgFU?C`8PEi0a*z(jgzI~j%e^+0d~AL06UT_v1-=Z
z+URpRdkoH?A$MFsf#_G>#7P?meaT+@0gwrJAKs7OfdhOOzlRUv_u1dsI(9R=h26?-
zW4E*QTL6e00zh&Yz7L=xWCa*<2f&XF@O=w>&*JxK6s%6tp?F(>-tqzk-6D
z#Ha9S0TJHKo?=h4f3WA^7~l{zFhlzi--tQKi0_#LsD81xBs@_=3lj~VH4K$dgLvb+9`ftcbE7=1YgETM|Hj+z~BH+
z?ukDm?ZWd*Aj$mv5{aH9k2H_%ai{`ljKnOE!70C&a#|kV2Kmt5qgD(K`Y=Py(;Y|{y zCT_1(7OQCNat4>o1G=7DITaZ2Uipv^QG@nkm0Gb-pvL&jpfG~aNrFz12ko_rKDit! zjZlcC3ATN`qo7=;=#}>+B=sfgjbU$K++z@b_Ki+9PWJB+9*mRMiiJ{bnr-~_#(4|d zJAD_d3SEEOoxPj4?|gLEle=GfYu|@r9US&BDdKbF^N8SINGT};6H>jh%OhH*`tG2| z?S(`Jw?c@9mh&4uaG?EU9666?E&jv$4o#b*F*v2NIjGmTUtgWrjTY`~wd` z2)Yb>+Qnrsqym<&g$AiM4eHa5?28+57TF)A`?Kr|C@>AOf8!y!)QFnU9H5(=Xc^$Z z3jhQDfiaH4608A?HyWplQXfHE@ewT$eqft%a@SJZKjVQUC@F5W$beb1olvqK;ntmREy z0~kAp#!|rn-NNyqK9DH^pI7h(K)v|~(1zGl^bv-E2gs+o`B;tb3xaZ<8kphi@pT7V zhSoNAiLb(g3->5o!woy&ukdTmDfB0p1K8ye;7SH^w?aZx^zDX$qQU&;@cse|XF6B9 zJ0XsPa05;T{IAfu1Dqf~TiyWfBR7y6Aw+)#E67byq0S!Jj~Se|gWnYhdLm^4)MIao zc6A-O8M2&_cYWj*)HJjJLK|@>95cAZ!A9R~cgK>T1|1;l$sJ?^__{mEU1TFf4}GMc zY(m*E)R{c+!a*x29?_@>nwSlNKX_wKEQgkzps$0~9f0)sldWGI?sM=R`;hHthuNoW zA8T-Eav~;C^w!`E_*{?J;UId67*SD(x1g#0WGlIs+{fNw2ig0;CzA)rc39O8vXea6 zGB{e3Fb{Tsz02PFvl-G5aWfw!k2y4>;hYCuk{BZn?62G3M|Sa!=`ZUH%mEz(-FOh# z{RZ}4cv`!Kq7V3@A$D<}kia-Xo+Hnr8SD`Ih@t;-o|?Q&UO9UbppU%DFZgWv0Ujy6 zPX0;WXyIXsxKZpA_OV0L^8Y%yenz^QyiNAb2Ui~!y?(Nf><76YQUr(G0QMg)9>CnH zA>-O39`OP4E>PYPR5&1KgXcY5d;7?H{ObSX`l49A4_OQH0sAZx9+Hp95eS>fVe&B; z)lbR)kk8l=_Bj~SFW8sttIfztz93)1-`7G&_%-_m{{AI~gUJwvbbDNUFwy|g;B4-E zfCv1&3W4&R&uhtax7P(Qx^>XjoB*)1w;P!$$A662ni@}JnE#CBh&4N#Vcv$=vc;eN{iNE?t9 zT0$oy-4NeD|rI5vdktbX|AiE~}C0Dtvgzg9u*a6^JNhH#X4uL3fabnal$zq>UN z-m_M^a9CcIIw@SX>7{KUEIa4bhyUn;19j7-V%KnsthlJMrYgUz2(BJf6k7}Ps;eOj zR9#`MEGew4E-5Q4E~&^BY$NqjxZ%a$xDzeG5S{L$UA!GU>pM{z{Q|ZOFRhybjNC+* z(-m|j?FnH`2x~()E`&#Hq8HLtcsKo92#*ZmM0Q;Wr!w1I9-Qz{Xr6#i_+Kp|Y!PuD z&xu9gQUra4N4u0>$&Yp!y_~M4SA_7W5Y~lod?oL~bWZ$kKtD z1L?%A;0**bILkR0bI4Oijhj4NTr-bmB5H^JHSRf3AKcEW5KUfF<$5Vyk zADCc1_Y?s}F5vY7J`cyE(;4nIKOkZ8mR}ft^1R_$Yx86%*u>7R;q@)&s^3ss)d=r| zD}>?Bv(Hs|CR}}TL0mVyI`3D{J{Q=r;Z@H$ZF_fHKuFOHub*?SuHgz9PtEYAdU%U- z&OH)P4OwdVICoOGNz_Xp3}It0eK>^Ec%P^R91@XM!6k}1vI{5m(Z}f%A)FqWTn4`a3Px&!bLpWr-m-V(G{-%#>w-HuNs~1l)(-P501$(C6V^^e6Q9D0x(VR7;d6YE9H#Q4dAE8g)8a9c_xv zi7t#TjxLQJ7d<7qK6-w1YqTTU8U0%H{^-NeUq&B|J{El{MiwKFQO2la0x_#%E{VA! 
z=H{3UF?YrE#`MSB8*_im12H>d9*j98(MqOE=1S&ES|y7lZ4#H{e90QgC6db|Yb94o zu9jRYxn6Rkq+fEk3Hchsa@JEohxmT zE|79kzqCiXTDn$xmGm0vb<*w9UD6k%FG^pMz9QW#{Zjg!^ps2{Q^-^@t!#vBlq^G* zE6bA=$i~acWe%A~)+zJKx?~s0m~4&gQrYFQD`YpzZk63GyF+$Vc2agqc3SpFtRhw& ztBoBQtBXyDO^>z3=Emm77RDCGPKqs$t&FXXof2CcyE3*f_U+i;w+ZF2-cPQ>u+^2Xzu|x5Y;t|DTiWd}bDGn+=QhcoVMDexaTg7*Z9~8eU zv63odlu~7^a)eT+Oi=2TMrFRTOj)CxsjOGdQreX*$_2`W%0RcTd;s#H~)%Bm_MC`Oda8PwdWO1A-JqVW=F}Z(w|benQ|(iq zuMViY)hpCJ>Pyt?)eoy*Qh%gArarA1rLk#pG^LtOO+dpmS7~n5^lJJwcWds^Y}Gud zd06wPW|!s(%~P6pG~a9G+KJjK?R4!-ZL_vj>(Dy2?OIOj*Y;>H)Lx`rt-VaUR(qxP zYVEb!$FxV{v~l{l(Q%n^xpDb%g>l7k6XGVtmB&@aRmauEHN`EATdeczg1U=zSLtrl z-KN{Ddr0?)?lIltx+ir{>z>iQu6slGmTs?ZzwUtUJ>C1dKjPEk>*80&hvF}ezdC+h z{4Mc!$3GCiJ$`5W?)W$3-;UoG|4#h7@dx8Sh(8p6IR5+iALDu1*Xk zUYvMo;^m3ACvHo8HgSLAmx(9!Dt(eZO>feh^%i}$zDQr9FVml=pP-+lZ__W=uhl=G ze^URv{zd)E`d9U@>)+A8t3Rm!Kz~SoSbsbzHp!SYF==|zjHH=K^+`*URwP}Jv?^(J zQYh)-q^py*Cf%2`ZS?-phmvEGmC41)>yx)6Z%f{uyfgXH6BtMn>S&A%0mog@0 zY>F`@JtZS0Hzhx%Fr_%9G-X`Ml$2>HGg9hPW~JCu7NjgpSv2P3F<*^2KIY_@(_{WH zL>XcXQbVkv&@dMW(Kdt2&}r}*x(q#r%M9xc8w__DdJX-CyAAgkwi@m;Y%^>(Ja2f> z@Ur1m!|R4O3~w3s84ef@8a_09G!~Dwjh!`i!PqrpZy&p3?7p#|jy;}AQ{z)DsoAMT zspY9Psk2i%Qr)R*QrD$!N_{Hz&D3vFzfJuv^@r4BsXwRwV$3w=8_SI687CMg8Ox1z z#(74E(P?Zqa>f;`he&Ye-d&VQiFN|LqzcC(9qiNA;k~CRbAZ=CJ zjcK=}ZA*JJ?b)>F(q2eUP0va%NuQWLEqy^cmwtWvy7WEiucW`8{zm#+>4(z4PXAZ> z(e&@re@y=={kVymqD>N$%p^A{O=^?YlxLb?nr?EK&Np3by3}-|={D1P(+1OBrp=}; zru$3}n0A<6FdZ`eWICOJGDt>LMrKBJhCAcJjN3A{XFQkjX2!P}-(~!eaV+EKj1w6r zGfrpxVa8^?d9*pjY%r&q)66Dwj=9)eY941EZ=PtLZk}nbH_tLJG%qr@nO)}d%>i?_ zd4+k6`8x9r=9|px%sb7y&2O6DHt#dPV}93s(EOqKu=!K-G4pZrujW(c-!pM$RHh^| zHdB$AkeQ!Zow+E}n;FWyDYG~8-pp;8J2D^4d^q#z%srXUWxkMkB=g72W0^l?9=A|S zj74TqSX35`CDk(7GSgCLskh9sth9tIw^}w>9=1Gf*=^Zl`KRT8<)Gz5%VEo>mTxRa zEk9U}S&myyWZ^7%mM+VdbzW9g)|9MiSu?X5vgT#Y&uYzbWI40ivqD*SW<8tre%2RR zC$dgv{hp1pqp~I0vDwOOP4?JqV|IFWMs{X4+f$>nGOFtzTLHWj$&=X(P61o75(^#o0#L;%$1{Xq&}WVw+~O z+ZNe6Y)fsOwl3QW+Xc2&w$-+f?MmA%wp(qt+1A@O*zUD$v+b}wWP8-M%l548McXU3 z*KKduKCpdiJ7W9N_KodF+t0RNY`@t~<;3Kub4KSFb4)pzIoUY{ImJ1pIp^g}$f+0o RKOy2bJw1R22EKCU{x6gd?27;Z diff --git a/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/xcuserdata/majimadaisuke.xcuserdatad/xcschemes/xcschememanagement.plist b/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/xcuserdata/majimadaisuke.xcuserdatad/xcschemes/xcschememanagement.plist deleted file mode 100644 index d4334ae..0000000 --- a/creative_apps/CDTNetDemo/CDTNetDemo.xcodeproj/xcuserdata/majimadaisuke.xcuserdatad/xcschemes/xcschememanagement.plist +++ /dev/null @@ -1,14 +0,0 @@ - - - - - SchemeUserState - - CDTNetDemo.xcscheme_^#shared#^_ - - orderHint - 0 - - - - From f958011fa37edab71bf7cb0f8b11b9064e69d9f6 Mon Sep 17 00:00:00 2001 From: MLBoy_DaisukeMajima Date: Sun, 29 Mar 2026 08:45:47 +0900 Subject: [PATCH 05/18] Add 10 new CoreML models with sample apps and conversion scripts New models: Depth Anything V2, YOLOv10-N, BiRefNet, Whisper Tiny, Depth Pro, Kokoro-82M TTS, SmolVLM2-500M, YOLOE-S, DWPose, PP-OCRv5. Covers new categories: speech recognition, TTS, VLM, open-vocab detection, pose estimation, and multilingual OCR. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 134 +++ conversion_scripts/convert_birefnet.py | 20 + .../convert_depth_anything_v2.py | 24 + conversion_scripts/convert_depth_pro.py | 28 + conversion_scripts/convert_dwpose.py | 25 + conversion_scripts/convert_kokoro.py | 29 + conversion_scripts/convert_ppocr_v5.py | 34 + conversion_scripts/convert_smolvlm2.py | 35 + conversion_scripts/convert_whisper.py | 36 + conversion_scripts/convert_yoloe.py | 15 + conversion_scripts/convert_yolov10.py | 26 + .../BiRefNetDemo.xcodeproj/project.pbxproj | 270 +++++ .../AccentColor.colorset/Contents.json | 11 + .../AppIcon.appiconset/Contents.json | 13 + .../Assets.xcassets/Contents.json | 6 + .../BiRefNetDemo/BiRefNetDemoApp.swift | 10 + .../BiRefNetDemo/ContentView.swift | 743 ++++++++++++ .../BiRefNetDemo/BiRefNetDemo/Info.plist | 8 + .../DepthProDemo.xcodeproj/project.pbxproj | 270 +++++ .../AccentColor.colorset/Contents.json | 11 + .../AppIcon.appiconset/Contents.json | 13 + .../Assets.xcassets/Contents.json | 6 + .../DepthProDemo/ContentView.swift | 864 ++++++++++++++ .../DepthProDemo/DepthProDemoApp.swift | 10 + .../DepthProDemo/DepthProDemo/Info.plist | 8 + .../KokoroDemo.xcodeproj/project.pbxproj | 270 +++++ .../AccentColor.colorset/Contents.json | 11 + .../AppIcon.appiconset/Contents.json | 13 + .../KokoroDemo/Assets.xcassets/Contents.json | 6 + .../KokoroDemo/KokoroDemo/ContentView.swift | 958 +++++++++++++++ .../KokoroDemo/KokoroDemo/Info.plist | 5 + .../KokoroDemo/KokoroDemo/KokoroDemoApp.swift | 10 + .../PPOCRv5Demo.xcodeproj/project.pbxproj | 270 +++++ .../AccentColor.colorset/Contents.json | 11 + .../AppIcon.appiconset/Contents.json | 13 + .../PPOCRv5Demo/Assets.xcassets/Contents.json | 6 + .../PPOCRv5Demo/PPOCRv5Demo/ContentView.swift | 1036 +++++++++++++++++ .../PPOCRv5Demo/PPOCRv5Demo/Info.plist | 10 + .../PPOCRv5Demo/PPOCRv5DemoApp.swift | 10 + .../SmolVLMDemo.xcodeproj/project.pbxproj | 270 +++++ .../AccentColor.colorset/Contents.json | 11 + .../AppIcon.appiconset/Contents.json | 13 + .../SmolVLMDemo/Assets.xcassets/Contents.json | 6 + .../SmolVLMDemo/SmolVLMDemo/ContentView.swift | 804 +++++++++++++ .../SmolVLMDemo/SmolVLMDemo/Info.plist | 10 + .../SmolVLMDemo/SmolVLMDemoApp.swift | 10 + .../WhisperDemo.xcodeproj/project.pbxproj | 270 +++++ .../AccentColor.colorset/Contents.json | 11 + .../AppIcon.appiconset/Contents.json | 13 + .../WhisperDemo/Assets.xcassets/Contents.json | 6 + .../WhisperDemo/WhisperDemo/ContentView.swift | 830 +++++++++++++ .../WhisperDemo/WhisperDemo/Info.plist | 8 + .../WhisperDemo/WhisperDemoApp.swift | 10 + .../YOLOEDemo.xcodeproj/project.pbxproj | 270 +++++ .../AccentColor.colorset/Contents.json | 11 + .../AppIcon.appiconset/Contents.json | 13 + .../YOLOEDemo/Assets.xcassets/Contents.json | 6 + .../YOLOEDemo/YOLOEDemo/ContentView.swift | 982 ++++++++++++++++ creative_apps/YOLOEDemo/YOLOEDemo/Info.plist | 10 + .../YOLOEDemo/YOLOEDemo/YOLOEDemoApp.swift | 10 + .../DWPoseDemo.xcodeproj/project.pbxproj | 340 ++++++ .../AccentColor.colorset/Contents.json | 11 + .../DWPoseDemo/Assets.xcassets/Contents.json | 6 + .../DWPoseDemo/DWPoseDemo/ContentView.swift | 659 +++++++++++ .../DWPoseDemo/DWPoseDemo/DWPoseDemoApp.swift | 10 + sample_apps/DWPoseDemo/DWPoseDemo/Info.plist | 8 + .../project.pbxproj | 340 ++++++ .../AccentColor.colorset/Contents.json | 11 + .../Assets.xcassets/Contents.json | 6 + .../DepthAnythingV2Demo/ContentView.swift | 438 +++++++ .../DepthAnythingV2DemoApp.swift | 10 + .../DepthAnythingV2Demo/Info.plist | 8 + 
.../YOLOv10Demo.xcodeproj/project.pbxproj | 340 ++++++ .../AccentColor.colorset/Contents.json | 11 + .../YOLOv10Demo/Assets.xcassets/Contents.json | 6 + .../YOLOv10Demo/YOLOv10Demo/ContentView.swift | 452 +++++++ .../YOLOv10Demo/YOLOv10Demo/Info.plist | 8 + .../YOLOv10Demo/YOLOv10DemoApp.swift | 10 + 78 files changed, 11526 insertions(+) create mode 100644 conversion_scripts/convert_birefnet.py create mode 100644 conversion_scripts/convert_depth_anything_v2.py create mode 100644 conversion_scripts/convert_depth_pro.py create mode 100644 conversion_scripts/convert_dwpose.py create mode 100644 conversion_scripts/convert_kokoro.py create mode 100644 conversion_scripts/convert_ppocr_v5.py create mode 100644 conversion_scripts/convert_smolvlm2.py create mode 100644 conversion_scripts/convert_whisper.py create mode 100644 conversion_scripts/convert_yoloe.py create mode 100644 conversion_scripts/convert_yolov10.py create mode 100644 creative_apps/BiRefNetDemo/BiRefNetDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/BiRefNetDemo/BiRefNetDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/BiRefNetDemo/BiRefNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 creative_apps/BiRefNetDemo/BiRefNetDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/BiRefNetDemo/BiRefNetDemo/BiRefNetDemoApp.swift create mode 100644 creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift create mode 100644 creative_apps/BiRefNetDemo/BiRefNetDemo/Info.plist create mode 100644 creative_apps/DepthProDemo/DepthProDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/DepthProDemo/DepthProDemo/ContentView.swift create mode 100644 creative_apps/DepthProDemo/DepthProDemo/DepthProDemoApp.swift create mode 100644 creative_apps/DepthProDemo/DepthProDemo/Info.plist create mode 100644 creative_apps/KokoroDemo/KokoroDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/KokoroDemo/KokoroDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/KokoroDemo/KokoroDemo/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 creative_apps/KokoroDemo/KokoroDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/KokoroDemo/KokoroDemo/ContentView.swift create mode 100644 creative_apps/KokoroDemo/KokoroDemo/Info.plist create mode 100644 creative_apps/KokoroDemo/KokoroDemo/KokoroDemoApp.swift create mode 100644 creative_apps/PPOCRv5Demo/PPOCRv5Demo.xcodeproj/project.pbxproj create mode 100644 creative_apps/PPOCRv5Demo/PPOCRv5Demo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/PPOCRv5Demo/PPOCRv5Demo/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 creative_apps/PPOCRv5Demo/PPOCRv5Demo/Assets.xcassets/Contents.json create mode 100644 creative_apps/PPOCRv5Demo/PPOCRv5Demo/ContentView.swift create mode 100644 creative_apps/PPOCRv5Demo/PPOCRv5Demo/Info.plist create mode 100644 creative_apps/PPOCRv5Demo/PPOCRv5Demo/PPOCRv5DemoApp.swift create mode 100644 creative_apps/SmolVLMDemo/SmolVLMDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/SmolVLMDemo/SmolVLMDemo/Assets.xcassets/AccentColor.colorset/Contents.json create 
mode 100644 creative_apps/SmolVLMDemo/SmolVLMDemo/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 creative_apps/SmolVLMDemo/SmolVLMDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/SmolVLMDemo/SmolVLMDemo/ContentView.swift create mode 100644 creative_apps/SmolVLMDemo/SmolVLMDemo/Info.plist create mode 100644 creative_apps/SmolVLMDemo/SmolVLMDemo/SmolVLMDemoApp.swift create mode 100644 creative_apps/WhisperDemo/WhisperDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/WhisperDemo/WhisperDemo/ContentView.swift create mode 100644 creative_apps/WhisperDemo/WhisperDemo/Info.plist create mode 100644 creative_apps/WhisperDemo/WhisperDemo/WhisperDemoApp.swift create mode 100644 creative_apps/YOLOEDemo/YOLOEDemo.xcodeproj/project.pbxproj create mode 100644 creative_apps/YOLOEDemo/YOLOEDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 creative_apps/YOLOEDemo/YOLOEDemo/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 creative_apps/YOLOEDemo/YOLOEDemo/Assets.xcassets/Contents.json create mode 100644 creative_apps/YOLOEDemo/YOLOEDemo/ContentView.swift create mode 100644 creative_apps/YOLOEDemo/YOLOEDemo/Info.plist create mode 100644 creative_apps/YOLOEDemo/YOLOEDemo/YOLOEDemoApp.swift create mode 100644 sample_apps/DWPoseDemo/DWPoseDemo.xcodeproj/project.pbxproj create mode 100644 sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/Contents.json create mode 100644 sample_apps/DWPoseDemo/DWPoseDemo/ContentView.swift create mode 100644 sample_apps/DWPoseDemo/DWPoseDemo/DWPoseDemoApp.swift create mode 100644 sample_apps/DWPoseDemo/DWPoseDemo/Info.plist create mode 100644 sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo.xcodeproj/project.pbxproj create mode 100644 sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/Contents.json create mode 100644 sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/ContentView.swift create mode 100644 sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/DepthAnythingV2DemoApp.swift create mode 100644 sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Info.plist create mode 100644 sample_apps/YOLOv10Demo/YOLOv10Demo.xcodeproj/project.pbxproj create mode 100644 sample_apps/YOLOv10Demo/YOLOv10Demo/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 sample_apps/YOLOv10Demo/YOLOv10Demo/Assets.xcassets/Contents.json create mode 100644 sample_apps/YOLOv10Demo/YOLOv10Demo/ContentView.swift create mode 100644 sample_apps/YOLOv10Demo/YOLOv10Demo/Info.plist create mode 100644 sample_apps/YOLOv10Demo/YOLOv10Demo/YOLOv10DemoApp.swift diff --git a/README.md b/README.md index 1f542f3..a666bc9 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,34 @@ You are free to do or not. 
 - [**Image Deblurring**](#image-deblurring) **:NEW**
   - [NAFNet](#nafnet)
 
+- [**Monocular Depth Estimation (Next-Gen)**](#monocular-depth-estimation-next-gen) **:NEW**
+  - [Depth Anything V2 Small](#depth-anything-v2-small)
+  - [Depth Pro](#depth-pro)
+
+- [**Object Detection (Next-Gen)**](#object-detection-next-gen) **:NEW**
+  - [YOLOv10-N](#yolov10-n)
+
+- [**Background Removal (SOTA)**](#background-removal-sota) **:NEW**
+  - [BiRefNet](#birefnet)
+
+- [**Speech Recognition**](#speech-recognition) **:NEW**
+  - [Whisper Tiny](#whisper-tiny)
+
+- [**Text-to-Speech**](#text-to-speech) **:NEW**
+  - [Kokoro-82M](#kokoro-82m)
+
+- [**Vision-Language Model**](#vision-language-model) **:NEW**
+  - [SmolVLM2-500M](#smolvlm2-500m)
+
+- [**Open-Vocabulary Detection**](#open-vocabulary-detection) **:NEW**
+  - [YOLOE-S](#yoloe-s)
+
+- [**Pose Estimation**](#pose-estimation) **:NEW**
+  - [DWPose / RTMPose](#dwpose--rtmpose)
+
+- [**Multilingual OCR**](#multilingual-ocr) **:NEW**
+  - [PP-OCRv5](#pp-ocrv5)
+
 # How to get the model
 
 You can get the model converted to CoreML format from the link of Google drive.
 See the section below for how to use it in Xcode.
@@ -1058,6 +1086,112 @@ Nonlinear Activation Free Network. State-of-the-art image deblurring without non
 
 | [NAFNet_Deblur (TBD)] | 130 MB | 256x256 blurry image | 256x256 deblurred image | [megvii-research/NAFNet](https://github.com/megvii-research/NAFNet) | MIT | 2022 | [NAFNetDemo](creative_apps/NAFNetDemo) |
 
+
+# Monocular Depth Estimation (Next-Gen)
+
+### Depth Anything V2 Small
+
+Depth Anything V2 (HKU & TikTok, 2024). State-of-the-art monocular depth estimation. Substantially more accurate than MiDaS, thanks to training on synthetic data plus large-scale pseudo-labeled real images. The Small variant is extremely lightweight (~25 MB).
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [DepthAnythingV2Small (TBD)] | 25 MB | 518x518 image | 518x518 relative depth map | [DepthAnything/Depth-Anything-V2](https://github.com/DepthAnything/Depth-Anything-V2) | [Apache 2.0](https://github.com/DepthAnything/Depth-Anything-V2/blob/main/LICENSE) | 2024 | [DepthAnythingV2Demo](sample_apps/DepthAnythingV2Demo) |
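+
+A quick sanity check for the converted model from Python (a sketch; it assumes the `DepthAnythingV2Small.mlpackage` produced by `conversion_scripts/convert_depth_anything_v2.py`, a placeholder `input.jpg`, and that CoreML prediction runs on macOS):
+
+```python
+import coremltools as ct
+import numpy as np
+from PIL import Image
+
+model = ct.models.MLModel("DepthAnythingV2Small.mlpackage")
+img = Image.open("input.jpg").convert("RGB").resize((518, 518))  # "input.jpg" is a placeholder
+
+depth = np.asarray(model.predict({"image": img})["depth"]).squeeze()  # relative depth map
+vis = ((depth - depth.min()) / (depth.max() - depth.min() + 1e-6) * 255).astype(np.uint8)
+Image.fromarray(vis).save("depth.png")
+```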
+
+### Depth Pro
+
+Apple Depth Pro (Apple, 2024). Metric depth estimation from a single image. Predicts absolute distance in meters together with an estimated focal length. Ideal for AR applications.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [DepthPro (TBD)] | 150 MB | 1536x1536 image | metric depth map (meters) + focal length | [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro) | [Apple Sample Code License](https://github.com/apple/ml-depth-pro/blob/main/LICENSE) | 2024 | [DepthProDemo](creative_apps/DepthProDemo) |
+
+# Object Detection (Next-Gen)
+
+### YOLOv10-N
+
+YOLOv10 Nano (Tsinghua, 2024). NMS-free real-time object detection. Consistent dual assignments during training eliminate the need for Non-Maximum Suppression at inference, reducing latency. The Nano variant is only ~8 MB.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [YOLOv10N (TBD)] | 8 MB | 640x640 image | bounding boxes + class scores (80 COCO classes) | [THU-MIG/yolov10](https://github.com/THU-MIG/yolov10) | [AGPL-3.0](https://github.com/THU-MIG/yolov10/blob/main/LICENSE) | 2024 | [YOLOv10Demo](sample_apps/YOLOv10Demo) |
+
+# Background Removal (SOTA)
+
+### BiRefNet
+
+Bilateral Reference Network (2024). State-of-the-art dichotomous image segmentation for high-quality background removal. Excels at fine details such as hair, fur, and transparent objects.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [BiRefNet (TBD)] | 80 MB | 1024x1024 image | 1024x1024 alpha mask | [ZhengPeng7/BiRefNet](https://github.com/ZhengPeng7/BiRefNet) | [MIT](https://github.com/ZhengPeng7/BiRefNet/blob/main/LICENSE) | 2024 | [BiRefNetDemo](creative_apps/BiRefNetDemo) |
+
+# Speech Recognition
+
+### Whisper Tiny
+
+OpenAI Whisper Tiny (OpenAI, 2022). Multilingual speech-to-text model supporting 99 languages. The Tiny variant (~75 MB) is ideal for on-device transcription; an optimized CoreML conversion pipeline is available through Argmax's WhisperKit.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [WhisperTinyEncoder (TBD)] | 75 MB | mel spectrogram (1,80,3000) | encoder hidden states | [openai/whisper](https://github.com/openai/whisper) | [MIT](https://github.com/openai/whisper/blob/main/LICENSE) | 2022 | [WhisperDemo](creative_apps/WhisperDemo) |
+
+Note: For production use, consider [WhisperKit](https://github.com/argmaxinc/WhisperKit), which provides optimized CoreML models with the full encoder + decoder pipeline.
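+
+To feed the encoder, compute the (1, 80, 3000) log-mel input with the Hugging Face processor (a sketch; assumes the `WhisperTinyEncoder.mlpackage` from `conversion_scripts/convert_whisper.py` and 16 kHz mono audio as a float array):
+
+```python
+import coremltools as ct
+import numpy as np
+from transformers import WhisperProcessor
+
+processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
+audio = np.zeros(16000 * 5, dtype=np.float32)  # placeholder: 5 s of silence at 16 kHz
+mel = processor(audio, sampling_rate=16000, return_tensors="np").input_features  # (1, 80, 3000)
+
+encoder = ct.models.MLModel("WhisperTinyEncoder.mlpackage")
+states = encoder.predict({"mel_input": mel.astype(np.float32)})["encoder_output"]
+print(states.shape)  # (1, 1500, 384) hidden states for whisper-tiny
+```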
+
+# Text-to-Speech
+
+### Kokoro-82M
+
+Kokoro-82M (2025). Ranked #1 on TTS Arena. Ultra-lightweight text-to-speech model with only 82M parameters, supporting 54 voices across 8 languages (EN, JP, FR, ES, IT, PT, HI, ZH). Runs 3.3x real-time on iPhone 13 Pro. A CoreML conversion and an iOS Swift package are already available.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [Kokoro82M (TBD)] | 80 MB (quantized) | phoneme tokens + voice style | 24kHz audio waveform | [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) | [Apache 2.0](https://huggingface.co/hexgrad/Kokoro-82M) | 2025 | [KokoroDemo](creative_apps/KokoroDemo) |
+
+Note: Pre-converted CoreML model available at [FluidInference/kokoro-82m-coreml](https://huggingface.co/FluidInference/kokoro-82m-coreml). iOS Swift package at [mlalma/kokoro-ios](https://github.com/mlalma/kokoro-ios).
+
+# Vision-Language Model
+
+### SmolVLM2-500M
+
+SmolVLM2-500M (HuggingFace, 2025). Among the smallest video-language models released to date. Describes images, answers visual questions, reads text (OCR), and understands video, all on-device. Only 500M parameters; runs on iPhone via MLX Swift.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [SmolVLM2_VisionEncoder (TBD)] | 245 MB (Q8) | 384x384 image + text tokens | text response | [HuggingFaceTB/SmolVLM2-500M-Video-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct) | [Apache 2.0](https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct) | 2025 | [SmolVLMDemo](creative_apps/SmolVLMDemo) |
+
+Note: GGUF models for llama.cpp available at [ggml-org/SmolVLM2-500M-Video-Instruct-GGUF](https://huggingface.co/ggml-org/SmolVLM2-500M-Video-Instruct-GGUF).
+
+# Open-Vocabulary Detection
+
+### YOLOE-S
+
+YOLOE-S (Tsinghua, ICCV 2025). Real-time open-vocabulary object detection and segmentation. Detects objects from a text description, a visual reference, or in prompt-free mode. Achieves +3.5 AP over YOLO-World with 1.4x faster inference and zero inference overhead compared to closed-set YOLOs.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [YOLOE_S (TBD)] | 50 MB | 640x640 image + text prompt | bounding boxes + segmentation masks | [THU-MIG/yoloe](https://github.com/THU-MIG/yoloe) | [AGPL-3.0](https://github.com/THU-MIG/yoloe/blob/main/LICENSE) | 2025 | [YOLOEDemo](creative_apps/YOLOEDemo) |
+
+# Pose Estimation
+
+### DWPose / RTMPose
+
+DWPose + RTMPose (2023-2025). Real-time whole-body pose estimation with 133 keypoints (body, hands, face, feet). DWPose distills larger models into a compact package with excellent accuracy, running at 70+ FPS on mobile.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [DWPose (TBD)] | 15-54 MB | 256x192 image | 17-133 keypoint heatmaps (SimCC) | [IDEA-Research/DWPose](https://github.com/IDEA-Research/DWPose) | [Apache 2.0](https://github.com/open-mmlab/mmpose/blob/main/LICENSE) | 2023 | [DWPoseDemo](sample_apps/DWPoseDemo) |
+
+# Multilingual OCR
+
+### PP-OCRv5
+
+PP-OCRv5 (Baidu, 2025). Ultra-lightweight multilingual OCR supporting 100+ languages. Two-stage pipeline: text detection + text recognition, with a total model size under 20 MB. Handles scene text, handwriting, documents, and more; a minimal detection-to-recognition sketch follows the tables below.
+
+| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
+| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
+| [PPOCRv5_Det (TBD)] | 10 MB | 640x640 image | text region heatmap | [PaddlePaddle/PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) | [Apache 2.0](https://github.com/PaddlePaddle/PaddleOCR/blob/main/LICENSE) | 2025 | [PPOCRv5Demo](creative_apps/PPOCRv5Demo) |
+| [PPOCRv5_Rec (TBD)] | 10 MB | 48x320 text crop | character sequence | | | | |
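+
+A hedged sketch of wiring the two stages together. `heatmap_to_boxes` and `CHARSET` are hypothetical placeholders (a real pipeline needs DB-style post-processing and the PP-OCRv5 dictionary file), and the input names and types depend on how the models were converted:
+
+```python
+import coremltools as ct
+import numpy as np
+from PIL import Image
+
+det = ct.models.MLModel("PPOCRv5_Det.mlmodel")
+rec = ct.models.MLModel("PPOCRv5_Rec.mlmodel")
+
+page = Image.open("document.jpg").convert("RGB")  # placeholder input
+x_det = np.transpose(np.asarray(page.resize((640, 640)), np.float32), (2, 0, 1))[None] / 255.0
+heat = np.asarray(next(iter(det.predict({"image": x_det}).values()))).squeeze()  # text-region map
+
+for box in heatmap_to_boxes(heat):           # hypothetical helper: threshold + contours -> (l, t, r, b)
+    crop = page.crop(box).resize((320, 48))  # rec expects 48x320 text-line crops
+    x = (np.transpose(np.asarray(crop, np.float32), (2, 0, 1))[None] / 255.0 - 0.5) / 0.5
+    logits = next(iter(rec.predict({"image": x}).values()))
+    print(ctc_greedy_decode(logits, CHARSET))  # decoder sketch: see conversion_scripts/convert_ppocr_v5.py
+```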
+
+
 # Thanks
 
 Cover image was taken from Ghibli free images.
diff --git a/conversion_scripts/convert_birefnet.py b/conversion_scripts/convert_birefnet.py
new file mode 100644
index 0000000..d212686
--- /dev/null
+++ b/conversion_scripts/convert_birefnet.py
@@ -0,0 +1,20 @@
+# BiRefNet -> CoreML conversion
+# pip install torch torchvision coremltools transformers
+import torch
+import coremltools as ct
+from transformers import AutoModelForImageSegmentation
+
+model = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet", trust_remote_code=True)
+model.eval()
+
+
+# torch.jit.trace needs a plain tensor output; BiRefNet returns a list of
+# side outputs, so wrap it and keep only the final mask logits.
+class BiRefNetWrapper(torch.nn.Module):
+    def __init__(self, net):
+        super().__init__()
+        self.net = net
+
+    def forward(self, x):
+        return self.net(x)[-1]
+
+
+dummy = torch.randn(1, 3, 1024, 1024)
+with torch.no_grad():
+    traced = torch.jit.trace(BiRefNetWrapper(model), dummy)
+
+# The demo app (BiRefNetDemo/ContentView.swift) feeds a CHW float array
+# normalized to 0-1 and applies sigmoid to the logits on device, so expose a
+# tensor input here. NOTE: BiRefNet is trained with ImageNet mean/std
+# normalization; the plain 0-1 scaling is a simplification.
+mlmodel = ct.convert(
+    traced,
+    inputs=[ct.TensorType(name="image", shape=(1, 3, 1024, 1024))],
+    outputs=[ct.TensorType(name="mask")],
+    minimum_deployment_target=ct.target.iOS16,
+    convert_to="mlprogram",
+)
+mlmodel.save("BiRefNet.mlpackage")
diff --git a/conversion_scripts/convert_depth_anything_v2.py b/conversion_scripts/convert_depth_anything_v2.py
new file mode 100644
index 0000000..d2b3205
--- /dev/null
+++ b/conversion_scripts/convert_depth_anything_v2.py
@@ -0,0 +1,24 @@
+# Depth Anything V2 Small -> CoreML conversion
+# pip install torch torchvision coremltools transformers
+import torch
+import coremltools as ct
+from transformers import AutoModelForDepthEstimation
+
+model_name = "depth-anything/Depth-Anything-V2-Small-hf"
+model = AutoModelForDepthEstimation.from_pretrained(model_name)
+model.eval()
+
+
+# The HF model returns a DepthEstimatorOutput dataclass, which torch.jit.trace
+# cannot handle; wrap it so the trace sees a plain tensor.
+class DepthWrapper(torch.nn.Module):
+    def __init__(self, net):
+        super().__init__()
+        self.net = net
+
+    def forward(self, x):
+        return self.net(pixel_values=x).predicted_depth
+
+
+# Trace
+dummy = torch.randn(1, 3, 518, 518)
+with torch.no_grad():
+    traced = torch.jit.trace(DepthWrapper(model), dummy)
+
+# Convert. NOTE: the model expects ImageNet mean/std normalization; plain
+# 1/255 scaling is an approximation that is fine for relative-depth visualization.
+mlmodel = ct.convert(
+    traced,
+    inputs=[ct.ImageType(name="image", shape=(1, 3, 518, 518), scale=1/255.0, bias=[0, 0, 0])],
+    outputs=[ct.TensorType(name="depth")],
+    minimum_deployment_target=ct.target.iOS16,
+    convert_to="mlprogram",
+)
+mlmodel.save("DepthAnythingV2Small.mlpackage")
+print("Saved DepthAnythingV2Small.mlpackage")
diff --git a/conversion_scripts/convert_depth_pro.py b/conversion_scripts/convert_depth_pro.py
new file mode 100644
index 0000000..c633f33
--- /dev/null
+++ b/conversion_scripts/convert_depth_pro.py
@@ -0,0 +1,28 @@
+# Depth Pro -> CoreML conversion
+# Apple's official repo: https://github.com/apple/ml-depth-pro
+# pip install depth-pro
+
+import torch
+import coremltools as ct
+import depth_pro
+
+# Load model
+model, transform = depth_pro.create_model_and_transforms()
+model.eval()
+
+# Trace with dummy input
+dummy = torch.randn(1, 3, 1536, 1536)
+
+# Note: Depth Pro outputs both a depth map and a focal-length estimate. The
+# high-level API is model.infer(); verify which quantity the raw traced
+# forward returns (it may be canonical inverse depth) before trusting
+# metric values.
+with torch.no_grad():
+    traced = torch.jit.trace(model, dummy)
+
+mlmodel = ct.convert(
+    traced,
+    inputs=[ct.ImageType(name="image", shape=(1, 3, 1536, 1536), scale=1/255.0)],
+    outputs=[ct.TensorType(name="depth"), ct.TensorType(name="focallength")],
+    minimum_deployment_target=ct.target.iOS16,
+    convert_to="mlprogram",
+)
+mlmodel.save("DepthPro.mlpackage")
+print("Saved DepthPro.mlpackage")
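+
+# Optional sanity check, a sketch: CoreML prediction requires macOS, and
+# "test.jpg" is a placeholder file name.
+from pathlib import Path
+if Path("test.jpg").exists():
+    import numpy as np
+    from PIL import Image
+    pred = mlmodel.predict({"image": Image.open("test.jpg").convert("RGB").resize((1536, 1536))})
+    print("depth shape:", np.asarray(pred["depth"]).shape, "focal length:", pred["focallength"])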
diff --git a/conversion_scripts/convert_dwpose.py b/conversion_scripts/convert_dwpose.py
new file mode 100644
index 0000000..77ab7ec
--- /dev/null
+++ b/conversion_scripts/convert_dwpose.py
@@ -0,0 +1,25 @@
+# DWPose / RTMPose -> CoreML conversion
+# DWPose uses RTMPose as backbone with distillation
+# pip install coremltools==5.2 onnx onnxruntime  (legacy ONNX frontend; see note below)
+
+import coremltools as ct
+
+# Download RTMPose ONNX model from:
+# https://github.com/open-mmlab/mmpose/tree/main/projects/rtmpose
+# rtmpose-m_simcc-body7_pt-body7_420e-256x192.onnx
+
+# For whole-body (133 keypoints):
+# dwpose: rtmw-x_simcc-cocktail14_pt-ucoco_270e-384x288.onnx
+
+onnx_path = "rtmpose-m_simcc-body7_pt-body7_420e-256x192.onnx"
+
+# NOTE: coremltools 6+ removed its ONNX frontend, so the legacy call below
+# requires coremltools 5.x and emits the older neuralnetwork format (.mlmodel,
+# which Xcode also accepts). With current coremltools, rebuild the model in
+# PyTorch (e.g. via mmdeploy) and use ct.convert on a traced module instead.
+mlmodel = ct.converters.onnx.convert(
+    model=onnx_path,
+    minimum_ios_deployment_target="13",
+)
+mlmodel.save("DWPose.mlmodel")
+print("Saved DWPose.mlmodel")
diff --git a/conversion_scripts/convert_kokoro.py b/conversion_scripts/convert_kokoro.py
new file mode 100644
index 0000000..19a600b
--- /dev/null
+++ b/conversion_scripts/convert_kokoro.py
@@ -0,0 +1,29 @@
+# Kokoro-82M -> CoreML conversion
+# Pre-converted CoreML model available at: https://huggingface.co/FluidInference/kokoro-82m-coreml
+# iOS Swift package: https://github.com/mlalma/kokoro-ios
+#
+# Manual conversion:
+# pip install coremltools==5.2 huggingface_hub  (legacy ONNX frontend; see note below)
+
+import coremltools as ct
+from huggingface_hub import hf_hub_download
+
+# Kokoro has a two-stage pipeline: Duration Predictor + Decoder
+# The model uses StyleTTS2-based architecture with ISTFTNet decoder
+
+# Download the ONNX export from HuggingFace
+repo_id = "hexgrad/Kokoro-82M"
+model_path = hf_hub_download(repo_id, "kokoro-v1.0.onnx")
+
+# NOTE: coremltools 6+ removed its ONNX frontend; this legacy call requires
+# coremltools 5.x and produces the older neuralnetwork format. For an
+# mlprogram package, prefer the pre-converted model linked above.
+mlmodel = ct.converters.onnx.convert(
+    model=model_path,
+    minimum_ios_deployment_target="13",
+)
+mlmodel.save("Kokoro82M.mlmodel")
+print("Saved Kokoro82M.mlmodel")
diff --git a/conversion_scripts/convert_ppocr_v5.py b/conversion_scripts/convert_ppocr_v5.py
new file mode 100644
index 0000000..fff08fb
--- /dev/null
+++ b/conversion_scripts/convert_ppocr_v5.py
@@ -0,0 +1,34 @@
+# PP-OCRv5 -> CoreML conversion
+# PP-OCRv5 by Baidu PaddlePaddle - Ultra lightweight multilingual OCR
+# https://github.com/PaddlePaddle/PaddleOCR
+# pip install paddlepaddle paddleocr paddle2onnx coremltools==5.2
+
+# Step 1: Export PaddleOCR to ONNX using paddle2onnx
+# paddle2onnx --model_dir ./PP-OCRv5_det --model_filename inference.pdmodel \
+#             --params_filename inference.pdiparams --save_file ppocrv5_det.onnx
+
+# Step 2: Convert ONNX to CoreML
+# NOTE: coremltools 6+ removed its ONNX frontend; the legacy calls below need
+# coremltools 5.x and emit the neuralnetwork format (.mlmodel).
+import coremltools as ct
+
+# Detection model (640x640 input, text-region heatmap output)
+det_ml = ct.converters.onnx.convert(
+    model="ppocrv5_det.onnx",
+    minimum_ios_deployment_target="13",
+)
+det_ml.save("PPOCRv5_Det.mlmodel")
+
+# Recognition model (48x320 text-line crops; PaddleOCR normalizes crops to
+# (x/255 - 0.5) / 0.5, which must be replicated at inference time)
+rec_ml = ct.converters.onnx.convert(
+    model="ppocrv5_rec.onnx",
+    minimum_ios_deployment_target="13",
+)
+rec_ml.save("PPOCRv5_Rec.mlmodel")
+print("Saved PPOCRv5_Det.mlmodel and PPOCRv5_Rec.mlmodel")
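+
+
+# Sketch: greedy CTC decoding for the recognition output. Assumes logits of
+# shape (1, T, num_classes) with class 0 as the CTC blank and a charset list
+# loaded from the PP-OCRv5 dictionary file; both depend on the exact export,
+# so verify against your model before relying on it.
+def ctc_greedy_decode(logits, charset):
+    import numpy as np
+    ids = np.argmax(np.asarray(logits)[0], axis=-1)
+    chars, prev = [], -1
+    for i in ids:
+        if i != prev and i != 0:  # collapse repeats, drop blanks
+            chars.append(charset[i - 1])
+        prev = int(i)
+    return "".join(chars)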
diff --git a/conversion_scripts/convert_smolvlm2.py b/conversion_scripts/convert_smolvlm2.py
new file mode 100644
index 0000000..996e435
--- /dev/null
+++ b/conversion_scripts/convert_smolvlm2.py
@@ -0,0 +1,35 @@
+# SmolVLM2-500M -> CoreML conversion
+# pip install torch coremltools transformers accelerate
+
+import torch
+import coremltools as ct
+from transformers import AutoProcessor, AutoModelForVision2Seq
+
+model_name = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
+processor = AutoProcessor.from_pretrained(model_name)
+model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.float32)
+model.eval()
+
+# Note: VLM conversion to CoreML is complex due to autoregressive generation.
+# For production use, consider:
+# 1. Export vision encoder separately
+# 2. Export language model separately
+# 3. Use MLX Swift for on-device inference (proven to work on iPhone)
+#
+# Vision Encoder conversion. The vision_model attribute path and the 384x384
+# input size follow the SmolVLM2 config; verify both for your transformers
+# version.
+vision_encoder = model.model.vision_model
+
+
+class VisionWrapper(torch.nn.Module):
+    """torch.jit.trace needs tensor outputs, not the BaseModelOutput the
+    encoder normally returns."""
+
+    def __init__(self, net):
+        super().__init__()
+        self.net = net
+
+    def forward(self, pixel_values):
+        return self.net(pixel_values=pixel_values).last_hidden_state
+
+
+dummy_pixel = torch.randn(1, 3, 384, 384)
+with torch.no_grad():
+    traced_vision = torch.jit.trace(VisionWrapper(vision_encoder), dummy_pixel)
+
+vision_ml = ct.convert(
+    traced_vision,
+    inputs=[ct.ImageType(name="pixel_values", shape=(1, 3, 384, 384), scale=1/255.0)],
+    outputs=[ct.TensorType(name="image_features")],
+    minimum_deployment_target=ct.target.iOS16,
+    convert_to="mlprogram",
+)
+vision_ml.save("SmolVLM2_VisionEncoder.mlpackage")
+print("Saved SmolVLM2_VisionEncoder.mlpackage")
+
+# For the full model, consider using GGUF format with llama.cpp or MLX Swift
+# GGUF models available at: https://huggingface.co/ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
diff --git a/conversion_scripts/convert_whisper.py b/conversion_scripts/convert_whisper.py
new file mode 100644
index 0000000..a0fb97c
--- /dev/null
+++ b/conversion_scripts/convert_whisper.py
@@ -0,0 +1,36 @@
+# Whisper Tiny -> CoreML conversion
+# Argmax's WhisperKit project provides an optimized conversion pipeline via
+# whisperkittools: https://github.com/argmaxinc/whisperkittools
+# pip install whisperkittools
+# Alternatively, use huggingface optimum:
+# pip install optimum[exporters]
+
+# Method 1: whisperkittools (recommended; see its README for the exact CLI)
+
+# Method 2: Manual conversion
+import torch
+import coremltools as ct
+from transformers import WhisperForConditionalGeneration
+
+model_name = "openai/whisper-tiny"
+model = WhisperForConditionalGeneration.from_pretrained(model_name)
+model.eval()
+
+# Convert encoder. Wrap it so the trace returns a tensor instead of the
+# BaseModelOutput dataclass.
+encoder = model.get_encoder()
+
+
+class EncoderWrapper(torch.nn.Module):
+    def __init__(self, net):
+        super().__init__()
+        self.net = net
+
+    def forward(self, mel):
+        return self.net(mel).last_hidden_state
+
+
+mel_input = torch.randn(1, 80, 3000)
+with torch.no_grad():
+    traced_encoder = torch.jit.trace(EncoderWrapper(encoder), mel_input)
+
+encoder_ml = ct.convert(
+    traced_encoder,
+    inputs=[ct.TensorType(name="mel_input", shape=(1, 80, 3000))],
+    outputs=[ct.TensorType(name="encoder_output")],
+    minimum_deployment_target=ct.target.iOS16,
+    convert_to="mlprogram",
+)
+encoder_ml.save("WhisperTinyEncoder.mlpackage")
+print("Saved WhisperTinyEncoder.mlpackage")
+
+# Note: Decoder conversion requires more complex handling for autoregressive generation.
+# For production use, consider WhisperKit's pre-converted models.
diff --git a/conversion_scripts/convert_yoloe.py b/conversion_scripts/convert_yoloe.py
new file mode 100644
index 0000000..e8e03de
--- /dev/null
+++ b/conversion_scripts/convert_yoloe.py
@@ -0,0 +1,15 @@
+# YOLOE-S -> CoreML conversion
+# YOLOE: Real-Time Seeing Anything (ICCV 2025)
+# https://github.com/THU-MIG/yoloe
+# pip install ultralytics
+
+from ultralytics import YOLO
+
+# YOLOE-S with text prompt capability
+model = YOLO("yoloe-11s-seg.pt")
+model.export(format="coreml", imgsz=640, half=True)
+print("Exported YOLOE-S to CoreML format")
+
+# Alternative: Export with ONNX first, then convert
+# model.export(format="onnx", imgsz=640)
+# (ONNX -> CoreML requires the legacy coremltools 5.x ONNX frontend)
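+
+# Sketch: bake a fixed text-prompt vocabulary into the export. set_classes /
+# get_text_pe follow the Ultralytics YOLOE docs; verify against your installed
+# version, and the class names below are placeholders.
+# names = ["person", "backpack", "coffee cup"]
+# model.set_classes(names, model.get_text_pe(names))
+# model.export(format="coreml", imgsz=640, half=True)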
Drag it into the Xcode project so the compiler +produces the bundled .mlmodelc at build time. + +Usage: + python convert_yolov10.py +""" + +from ultralytics import YOLO + +# Download (if needed) and load the pretrained YOLOv10-N weights +model = YOLO("yolov10n.pt") + +# Export to CoreML +# - imgsz : input resolution expected by the model +# - half : use float16 for smaller model size on device +# - nms : disable built-in NMS (YOLOv10 is NMS-free by design) +model.export(format="coreml", imgsz=640, half=True, nms=False) + +print("CoreML conversion complete. Look for yolov10n.mlpackage") diff --git a/creative_apps/BiRefNetDemo/BiRefNetDemo.xcodeproj/project.pbxproj b/creative_apps/BiRefNetDemo/BiRefNetDemo.xcodeproj/project.pbxproj new file mode 100644 index 0000000..2f2013f --- /dev/null +++ b/creative_apps/BiRefNetDemo/BiRefNetDemo.xcodeproj/project.pbxproj @@ -0,0 +1,270 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + BR0001 /* BiRefNetDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = BR0002; }; + BR0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = BR0004; }; + BR0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = BR0006; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + BR0007 /* BiRefNetDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = BiRefNetDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; + BR0002 /* BiRefNetDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BiRefNetDemoApp.swift; sourceTree = ""; }; + BR0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + BR0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + BR0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + BR0009 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + BR0010 = { + isa = PBXGroup; + children = ( + BR0011 /* BiRefNetDemo */, + BR0012 /* Products */, + ); + sourceTree = ""; + }; + BR0011 /* BiRefNetDemo */ = { + isa = PBXGroup; + children = ( + BR0002 /* BiRefNetDemoApp.swift */, + BR0004 /* ContentView.swift */, + BR0006 /* Assets.xcassets */, + BR0008 /* Info.plist */, + ); + path = BiRefNetDemo; + sourceTree = ""; + }; + BR0012 /* Products */ = { + isa = PBXGroup; + children = ( + BR0007 /* BiRefNetDemo.app */, + ); + name = Products; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + BR0013 /* BiRefNetDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = BR0014; + buildPhases = ( + BR0015 /* Sources */, + BR0009 /* Frameworks */, + BR0016 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = BiRefNetDemo; + productName = BiRefNetDemo; + productReference = BR0007; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + BR0017 /* Project object */ = { + isa = PBXProject; + attributes = { + 
BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + BR0013 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = BR0018; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = BR0010; + productRefGroup = BR0012; + projectDirPath = ""; + projectRoot = ""; + targets = ( + BR0013, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + BR0016 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + BR0005 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + BR0015 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + BR0001 /* BiRefNetDemoApp.swift in Sources */, + BR0003 /* ContentView.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + BR0019 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + BR0020 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + BR0021 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = BiRefNetDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + 
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.birefnetdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + BR0022 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = BiRefNetDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.birefnetdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + BR0018 /* Build configuration list for PBXProject */ = { + isa = XCConfigurationList; + buildConfigurations = ( + BR0019, + BR0020, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + BR0014 /* Build configuration list for PBXNativeTarget */ = { + isa = XCConfigurationList; + buildConfigurations = ( + BR0021, + BR0022, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = BR0017; +} diff --git a/creative_apps/BiRefNetDemo/BiRefNetDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/BiRefNetDemo/BiRefNetDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/BiRefNetDemo/BiRefNetDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/BiRefNetDemo/BiRefNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/BiRefNetDemo/BiRefNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/BiRefNetDemo/BiRefNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/BiRefNetDemo/BiRefNetDemo/Assets.xcassets/Contents.json b/creative_apps/BiRefNetDemo/BiRefNetDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ 
b/creative_apps/BiRefNetDemo/BiRefNetDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/BiRefNetDemo/BiRefNetDemo/BiRefNetDemoApp.swift b/creative_apps/BiRefNetDemo/BiRefNetDemo/BiRefNetDemoApp.swift new file mode 100644 index 0000000..69b7f97 --- /dev/null +++ b/creative_apps/BiRefNetDemo/BiRefNetDemo/BiRefNetDemoApp.swift @@ -0,0 +1,10 @@ +import SwiftUI + +@main +struct BiRefNetDemoApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} diff --git a/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift b/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift new file mode 100644 index 0000000..e5cc6ca --- /dev/null +++ b/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift @@ -0,0 +1,743 @@ +import SwiftUI +import UIKit +import CoreML +import Vision +import PhotosUI +import Photos + +// MARK: - Background Removal using BiRefNet +// BiRefNet is a bilateral reference network for high-resolution dichotomous image segmentation. +// It takes an input image and produces a precise foreground mask, enabling clean background removal. + +struct ContentView: View { + @StateObject private var viewModel = BackgroundRemovalViewModel() + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 20) { + // Image picker section + Section { + PhotosPicker(selection: $viewModel.selectedPhoto, + matching: .images) { + if let image = viewModel.inputImage { + Image(uiImage: image) + .resizable() + .scaledToFit() + .frame(maxHeight: 250) + .cornerRadius(12) + } else { + placeholderView(title: "Select an Image", + systemImage: "photo.on.rectangle") + } + } + } header: { + sectionHeader("Input Image") + } + + // Process button + if viewModel.inputImage != nil { + Button(action: { viewModel.removeBackground() }) { + HStack { + if viewModel.isProcessing { + ProgressView() + .tint(.white) + } else { + Image(systemName: "scissors") + } + Text(viewModel.isProcessing ? "Processing..." : "Remove Background") + } + .frame(maxWidth: .infinity) + .padding() + .background(viewModel.isProcessing ? 
Color.gray : Color.accentColor) + .foregroundColor(.white) + .cornerRadius(12) + } + .disabled(viewModel.isProcessing) + } + + // Progress indicator + if viewModel.isProcessing { + VStack(spacing: 8) { + ProgressView(value: viewModel.progress) + .progressViewStyle(.linear) + Text(viewModel.progressMessage) + .font(.caption) + .foregroundColor(.secondary) + } + .padding(.horizontal) + } + + // Error display + if let error = viewModel.errorMessage { + Text(error) + .foregroundColor(.red) + .font(.caption) + .padding() + .background(Color.red.opacity(0.1)) + .cornerRadius(8) + } + + // Display mode selector + if viewModel.maskImage != nil { + Section { + Picker("Display Mode", selection: $viewModel.displayMode) { + Text("Comparison").tag(DisplayMode.comparison) + Text("Mask").tag(DisplayMode.mask) + Text("Cutout").tag(DisplayMode.cutout) + } + .pickerStyle(.segmented) + } header: { + sectionHeader("View Mode") + } + } + + // Before / After comparison + if viewModel.displayMode == .comparison, + let original = viewModel.inputImage, + let cutout = viewModel.cutoutImage { + Section { + BeforeAfterView( + before: original, + after: cutout + ) + .frame(height: 300) + .cornerRadius(12) + } header: { + sectionHeader("Before / After") + } + } + + // Mask view + if viewModel.displayMode == .mask, + let mask = viewModel.maskImage { + Section { + Image(uiImage: mask) + .resizable() + .scaledToFit() + .frame(maxHeight: 300) + .cornerRadius(12) + } header: { + sectionHeader("Segmentation Mask") + } + } + + // Cutout result + if viewModel.displayMode == .cutout, + let cutout = viewModel.cutoutImage { + Section { + VStack(spacing: 12) { + // Background color selector + HStack(spacing: 12) { + Text("Background:") + .font(.subheadline) + ForEach(BackgroundOption.allCases, id: \.self) { option in + Button(action: { + viewModel.backgroundOption = option + viewModel.updateCutout() + }) { + Circle() + .fill(option.color) + .frame(width: 30, height: 30) + .overlay( + Circle() + .stroke(viewModel.backgroundOption == option ? Color.accentColor : Color.clear, lineWidth: 3) + ) + .overlay( + option == .transparent ? 
+ Image(systemName: "checkerboard.rectangle") + .font(.caption2) + .foregroundColor(.gray) : nil + ) + } + } + Spacer() + } + + // Cutout image with checkerboard for transparent + ZStack { + if viewModel.backgroundOption == .transparent { + CheckerboardView() + .frame(maxHeight: 300) + .cornerRadius(12) + } + Image(uiImage: cutout) + .resizable() + .scaledToFit() + .frame(maxHeight: 300) + .cornerRadius(12) + } + } + } header: { + sectionHeader("Cutout Result") + } + } + + // Save button + if viewModel.cutoutImage != nil { + Button(action: { viewModel.saveToPhotoLibrary() }) { + HStack { + Image(systemName: "square.and.arrow.down") + Text("Save to Photos") + } + .frame(maxWidth: .infinity) + .padding() + .background(Color.green) + .foregroundColor(.white) + .cornerRadius(12) + } + + if viewModel.savedSuccessfully { + Text("Saved to Photo Library!") + .foregroundColor(.green) + .font(.caption) + .transition(.opacity) + } + } + } + .padding() + } + .navigationTitle("BiRefNet Background Removal") + .navigationBarTitleDisplayMode(.inline) + } + } + + private func sectionHeader(_ title: String) -> some View { + HStack { + Text(title) + .font(.headline) + Spacer() + } + } + + private func placeholderView(title: String, systemImage: String) -> some View { + VStack(spacing: 12) { + Image(systemName: systemImage) + .font(.system(size: 40)) + .foregroundColor(.secondary) + Text(title) + .foregroundColor(.secondary) + } + .frame(maxWidth: .infinity) + .frame(height: 180) + .background(Color(.systemGray6)) + .cornerRadius(12) + } +} + +// MARK: - Display Mode + +enum DisplayMode { + case comparison + case mask + case cutout +} + +// MARK: - Background Options + +enum BackgroundOption: CaseIterable { + case transparent + case white + case black + case green + case blue + + var color: Color { + switch self { + case .transparent: return Color.clear + case .white: return Color.white + case .black: return Color.black + case .green: return Color.green + case .blue: return Color.blue + } + } + + var uiColor: UIColor? { + switch self { + case .transparent: return nil + case .white: return .white + case .black: return .black + case .green: return UIColor(red: 0, green: 0.8, blue: 0, alpha: 1) + case .blue: return UIColor(red: 0, green: 0.4, blue: 1, alpha: 1) + } + } +} + +// MARK: - ViewModel + +class BackgroundRemovalViewModel: ObservableObject { + @Published var selectedPhoto: PhotosPickerItem? { + didSet { loadImage() } + } + @Published var inputImage: UIImage? + @Published var maskImage: UIImage? + @Published var cutoutImage: UIImage? + @Published var isProcessing = false + @Published var progress: Double = 0.0 + @Published var progressMessage: String = "" + @Published var errorMessage: String? + @Published var displayMode: DisplayMode = .comparison + @Published var backgroundOption: BackgroundOption = .transparent + @Published var savedSuccessfully = false + + private var rawMaskData: [Float]? + private var maskWidth: Int = 0 + private var maskHeight: Int = 0 + + private func loadImage() { + guard let item = selectedPhoto else { return } + Task { + if let data = try? 
await item.loadTransferable(type: Data.self), + let image = UIImage(data: data) { + await MainActor.run { + self.inputImage = image + self.maskImage = nil + self.cutoutImage = nil + self.errorMessage = nil + self.savedSuccessfully = false + self.rawMaskData = nil + self.displayMode = .comparison + } + } + } + } + + func removeBackground() { + guard let inputImage = inputImage else { return } + isProcessing = true + errorMessage = nil + progress = 0.0 + progressMessage = "Loading model..." + + Task { + do { + let result = try await performSegmentation(image: inputImage) + await MainActor.run { + self.maskImage = result.mask + self.cutoutImage = result.cutout + self.isProcessing = false + self.progress = 1.0 + self.progressMessage = "Complete!" + self.displayMode = .comparison + } + } catch { + await MainActor.run { + self.errorMessage = error.localizedDescription + self.isProcessing = false + self.progress = 0.0 + self.progressMessage = "" + } + } + } + } + + func updateCutout() { + guard let inputImage = inputImage, + let maskData = rawMaskData else { return } + let w = maskWidth + let h = maskHeight + cutoutImage = applyMask(to: inputImage, maskData: maskData, + maskWidth: w, maskHeight: h, + background: backgroundOption.uiColor) + } + + func saveToPhotoLibrary() { + guard let image = cutoutImage else { return } + PHPhotoLibrary.requestAuthorization(for: .addOnly) { status in + guard status == .authorized || status == .limited else { + DispatchQueue.main.async { + self.errorMessage = "Photo library access denied." + } + return + } + guard let pngData = image.pngData() else { + DispatchQueue.main.async { + self.errorMessage = "Failed to encode image." + } + return + } + PHPhotoLibrary.shared().performChanges { + let request = PHAssetCreationRequest.forAsset() + request.addResource(with: .photo, data: pngData, options: nil) + } completionHandler: { success, error in + DispatchQueue.main.async { + if success { + self.savedSuccessfully = true + DispatchQueue.main.asyncAfter(deadline: .now() + 2) { + self.savedSuccessfully = false + } + } else { + self.errorMessage = error?.localizedDescription ?? "Failed to save." + } + } + } + } + } + + // MARK: - Core ML Inference + + private func performSegmentation(image: UIImage) async throws -> (mask: UIImage, cutout: UIImage) { + // Load the CoreML model + guard let modelURL = Bundle.main.url(forResource: "BiRefNet", withExtension: "mlmodelc") else { + throw SegmentationError.modelNotFound( + "BiRefNet.mlmodelc not found in bundle. " + + "Please convert the BiRefNet model to CoreML format using convert_birefnet.py, " + + "then compile the .mlpackage and add it to the Xcode project." + ) + } + + await MainActor.run { + self.progress = 0.1 + self.progressMessage = "Loading model..." + } + + let config = MLModelConfiguration() + config.computeUnits = .cpuAndNeuralEngine + let model = try MLModel(contentsOf: modelURL, configuration: config) + + await MainActor.run { + self.progress = 0.3 + self.progressMessage = "Preparing image..." + } + + // Prepare input image (1, 3, 1024, 1024) + let targetSize = CGSize(width: 1024, height: 1024) + guard let resizedCG = image.resized(to: targetSize)?.cgImage else { + throw SegmentationError.imageProcessingFailed("Failed to resize input image") + } + + let inputArray = try MLMultiArray(shape: [1, 3, 1024, 1024], dataType: .float32) + fillMultiArrayFromImage(resizedCG, into: inputArray, size: 1024) + + await MainActor.run { + self.progress = 0.5 + self.progressMessage = "Running BiRefNet inference..." 
+            }
+
+            // Run inference
+            let inputFeatures = try MLDictionaryFeatureProvider(dictionary: [
+                "image": MLFeatureValue(multiArray: inputArray)
+            ])
+            let prediction = try model.prediction(from: inputFeatures)
+
+            await MainActor.run {
+                self.progress = 0.8
+                self.progressMessage = "Generating mask..."
+            }
+
+            // Extract mask output (1, 1, 1024, 1024), apply sigmoid
+            guard let outputArray = prediction.featureValue(for: "mask")?.multiArrayValue else {
+                throw SegmentationError.imageProcessingFailed("Failed to extract mask output from model")
+            }
+
+            let width = 1024
+            let height = 1024
+            var maskData = [Float](repeating: 0, count: width * height)
+
+            let outputPointer = outputArray.dataPointer.bindMemory(to: Float.self, capacity: width * height)
+            for i in 0..<(width * height) {
+                let raw = outputPointer[i]
+                maskData[i] = 1.0 / (1.0 + exp(-raw)) // sigmoid
+            }
+
+            // Store raw mask for background option changes
+            await MainActor.run {
+                self.rawMaskData = maskData
+                self.maskWidth = width
+                self.maskHeight = height
+            }
+
+            // Generate mask visualization image
+            let maskUIImage = maskToUIImage(maskData: maskData, width: width, height: height)
+            guard let finalMask = maskUIImage else {
+                throw SegmentationError.imageProcessingFailed("Failed to create mask image")
+            }
+
+            await MainActor.run {
+                self.progress = 0.9
+                self.progressMessage = "Applying mask to image..."
+            }
+
+            // Apply mask to original image for cutout
+            let bgColor = await MainActor.run { self.backgroundOption.uiColor }
+            let cutoutUIImage = applyMask(to: image, maskData: maskData,
+                                          maskWidth: width, maskHeight: height,
+                                          background: bgColor)
+            guard let finalCutout = cutoutUIImage else {
+                throw SegmentationError.imageProcessingFailed("Failed to apply mask to image")
+            }
+
+            return (mask: finalMask, cutout: finalCutout)
+    }
+
+    // MARK: - Image Processing Helpers
+
+    /// Fill MLMultiArray with pixel data from CGImage (RGB, normalized 0-1)
+    private func fillMultiArrayFromImage(_ cgImage: CGImage, into array: MLMultiArray, size: Int) {
+        let bytesPerPixel = 4
+        let bytesPerRow = bytesPerPixel * size
+        var pixelData = [UInt8](repeating: 0, count: size * size * bytesPerPixel)
+
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        guard let context = CGContext(
+            data: &pixelData, width: size, height: size,
+            bitsPerComponent: 8, bytesPerRow: bytesPerRow,
+            space: colorSpace,
+            bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
+        ) else { return }
+
+        context.draw(cgImage, in: CGRect(x: 0, y: 0, width: size, height: size))
+
+        let ptr = array.dataPointer.bindMemory(to: Float.self, capacity: 3 * size * size)
+        let channelStride = size * size
+
+        for y in 0..<size {
+            for x in 0..<size {
+                // Channel-first (CHW) layout, values normalized to 0-1
+                let pixelIndex = (y * size + x) * bytesPerPixel
+                let planeIndex = y * size + x
+                ptr[planeIndex] = Float(pixelData[pixelIndex]) / 255.0
+                ptr[channelStride + planeIndex] = Float(pixelData[pixelIndex + 1]) / 255.0
+                ptr[2 * channelStride + planeIndex] = Float(pixelData[pixelIndex + 2]) / 255.0
+            }
+        }
+    }
+
+    /// Convert the float mask into a grayscale UIImage for visualization
+    private func maskToUIImage(maskData: [Float], width: Int, height: Int) -> UIImage? {
+        var pixelData = [UInt8](repeating: 0, count: width * height * 4)
+
+        for i in 0..<(width * height) {
+            let val = UInt8(min(max(maskData[i], 0), 1) * 255)
+            pixelData[i * 4] = val
+            pixelData[i * 4 + 1] = val
+            pixelData[i * 4 + 2] = val
+            pixelData[i * 4 + 3] = 255
+        }
+
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        guard let context = CGContext(
+            data: &pixelData, width: width, height: height,
+            bitsPerComponent: 8, bytesPerRow: width * 4,
+            space: colorSpace,
+            bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
+        ), let cgImage = context.makeImage() else { return nil }
+
+        return UIImage(cgImage: cgImage)
+    }
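+
+    // The feature names "image" and "mask" used above must match the converted
+    // package; conversion scripts sometimes choose different names. A small
+    // debug helper, added here for illustration (not part of the original demo),
+    // prints a compiled model's actual feature names:
+    private func logFeatureNames(of model: MLModel) {
+        let description = model.modelDescription
+        print("Model inputs:", Array(description.inputDescriptionsByFeatureName.keys))
+        print("Model outputs:", Array(description.outputDescriptionsByFeatureName.keys))
+    }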
+
+    /// Apply the segmentation mask to the original image
+    /// If background color is nil, the result has transparency (PNG-friendly).
+    private func applyMask(to image: UIImage, maskData: [Float],
+                           maskWidth: Int, maskHeight: Int,
+                           background: UIColor?) -> UIImage? {
+        let origWidth = Int(image.size.width)
+        let origHeight = Int(image.size.height)
+
+        guard let cgImage = image.cgImage else { return nil }
+
+        let bytesPerPixel = 4
+        let bytesPerRow = bytesPerPixel * origWidth
+        var pixelData = [UInt8](repeating: 0, count: origWidth * origHeight * bytesPerPixel)
+
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        guard let context = CGContext(
+            data: &pixelData, width: origWidth, height: origHeight,
+            bitsPerComponent: 8, bytesPerRow: bytesPerRow,
+            space: colorSpace,
+            bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
+        ) else { return nil }
+
+        context.draw(cgImage, in: CGRect(x: 0, y: 0, width: origWidth, height: origHeight))
+
+        // Determine background RGBA
+        var bgR: UInt8 = 0, bgG: UInt8 = 0, bgB: UInt8 = 0, bgA: UInt8 = 0
+        if let bg = background {
+            var r: CGFloat = 0, g: CGFloat = 0, b: CGFloat = 0, a: CGFloat = 0
+            bg.getRed(&r, green: &g, blue: &b, alpha: &a)
+            bgR = UInt8(r * 255)
+            bgG = UInt8(g * 255)
+            bgB = UInt8(b * 255)
+            bgA = UInt8(a * 255)
+        }
+
+        var outputData = [UInt8](repeating: 0, count: origWidth * origHeight * bytesPerPixel)
+
+        for y in 0..<origHeight {
+            for x in 0..<origWidth {
+                // Sample the mask with nearest-neighbor scaling
+                let maskX = min(maskWidth - 1, x * maskWidth / origWidth)
+                let maskY = min(maskHeight - 1, y * maskHeight / origHeight)
+                let alpha = maskData[maskY * maskWidth + maskX]
+
+                let idx = (y * origWidth + x) * bytesPerPixel
+                let fgR = Float(pixelData[idx])
+                let fgG = Float(pixelData[idx + 1])
+                let fgB = Float(pixelData[idx + 2])
+
+                // Blend foreground over the chosen background using the mask
+                outputData[idx] = UInt8(min(255, fgR * alpha + Float(bgR) * (1 - alpha)))
+                outputData[idx + 1] = UInt8(min(255, fgG * alpha + Float(bgG) * (1 - alpha)))
+                outputData[idx + 2] = UInt8(min(255, fgB * alpha + Float(bgB) * (1 - alpha)))
+                outputData[idx + 3] = background == nil
+                    ? UInt8(alpha * 255)
+                    : UInt8(min(255, alpha * 255 + Float(bgA) * (1 - alpha)))
+            }
+        }
+
+        guard let outputContext = CGContext(
+            data: &outputData, width: origWidth, height: origHeight,
+            bitsPerComponent: 8, bytesPerRow: bytesPerRow,
+            space: colorSpace,
+            bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
+        ), let outputImage = outputContext.makeImage() else { return nil }
+
+        return UIImage(cgImage: outputImage)
+    }
+}
+
+// MARK: - Checkerboard View
+
+struct CheckerboardView: View {
+    var body: some View {
+        Canvas { context, size in
+            let square: CGFloat = 10
+            let cols = Int(ceil(size.width / square))
+            let rows = Int(ceil(size.height / square))
+            for row in 0..<rows {
+                for col in 0..<cols {
+                    let isLight = (row + col) % 2 == 0
+                    let rect = CGRect(x: CGFloat(col) * square,
+                                      y: CGFloat(row) * square,
+                                      width: square, height: square)
+                    context.fill(Path(rect),
+                                 with: .color(isLight ? Color(.systemGray5) : Color(.systemGray3)))
+                }
+            }
+        }
+    }
+}
+
+// MARK: - Errors
+
+enum SegmentationError: LocalizedError {
+    case modelNotFound(String)
+    case imageProcessingFailed(String)
+
+    var errorDescription: String? {
+        switch self {
+        case .modelNotFound(let msg): return msg
+        case .imageProcessingFailed(let msg): return msg
+        }
+    }
+}
+
+// MARK: - UIImage Extension
+
+extension UIImage {
+    func resized(to targetSize: CGSize) -> UIImage? {
+        let renderer = UIGraphicsImageRenderer(size: targetSize)
+        return renderer.image { _ in
+            self.draw(in: CGRect(origin: .zero, size: targetSize))
+        }
+    }
+}
+
+#Preview {
+    ContentView()
+}
diff --git a/creative_apps/BiRefNetDemo/BiRefNetDemo/Info.plist b/creative_apps/BiRefNetDemo/BiRefNetDemo/Info.plist
new file mode 100644
index 0000000..243640b
--- /dev/null
+++ b/creative_apps/BiRefNetDemo/BiRefNetDemo/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>This app needs photo library access for selecting images to remove backgrounds.</string>
+</dict>
+</plist>
diff --git a/creative_apps/DepthProDemo/DepthProDemo.xcodeproj/project.pbxproj b/creative_apps/DepthProDemo/DepthProDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..234a9c3
--- /dev/null
+++ b/creative_apps/DepthProDemo/DepthProDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,270 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		DP0001 /* DepthProDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = DP0002; };
+		DP0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = DP0004; };
+		DP0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = DP0006; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		DP0007 /* DepthProDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DepthProDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		DP0002 /* DepthProDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DepthProDemoApp.swift; sourceTree = "<group>"; };
+		DP0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		DP0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		DP0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		DP0009 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		DP0010 = {
+			isa = PBXGroup;
+			children = (
+				DP0011 /* DepthProDemo */,
+				DP0012 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		DP0011 /* DepthProDemo */ = {
+			isa = PBXGroup;
+			children = (
+				DP0002 /* DepthProDemoApp.swift */,
+				DP0004 /* ContentView.swift */,
+				DP0006 /* Assets.xcassets */,
+				DP0008 /* Info.plist */,
+			);
+			path = DepthProDemo;
+			sourceTree = "<group>";
+		};
+		DP0012 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				DP0007 /* DepthProDemo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		DP0013 /* DepthProDemo */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = DP0014;
+			buildPhases = (
+				DP0015 /* Sources */,
+				DP0009 /* Frameworks */,
+				DP0016 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = DepthProDemo;
+			productName = DepthProDemo;
+			productReference = DP0007;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		DP0017 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				BuildIndependentTargetsInParallel = 1;
+				LastSwiftUpdateCheck = 1500;
+				LastUpgradeCheck = 1500;
+				TargetAttributes = {
+					DP0013 = {
+						CreatedOnToolsVersion = 15.0;
+					};
+				};
+			};
+			buildConfigurationList = DP0018;
+			compatibilityVersion = "Xcode 14.0";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = DP0010;
+			productRefGroup = DP0012;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				DP0013,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+		DP0016 /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				DP0005 /* Assets.xcassets in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		DP0015 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+ files = ( + DP0001 /* DepthProDemoApp.swift in Sources */, + DP0003 /* ContentView.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + DP0019 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + DP0020 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + DP0021 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = DepthProDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.depthprodemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + DP0022 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = DepthProDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + 
INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.depthprodemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + DP0018 /* Build configuration list for PBXProject */ = { + isa = XCConfigurationList; + buildConfigurations = ( + DP0019, + DP0020, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + DP0014 /* Build configuration list for PBXNativeTarget */ = { + isa = XCConfigurationList; + buildConfigurations = ( + DP0021, + DP0022, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = DP0017; +} diff --git a/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/Contents.json b/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift b/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift new file mode 100644 index 0000000..8244c85 --- /dev/null +++ b/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift @@ -0,0 +1,864 @@ +import SwiftUI +import UIKit +import CoreML +import PhotosUI +import CoreMotion + +// MARK: - Apple Depth Pro - Metric Depth Estimation Demo +// +// Depth Pro produces metric (absolute) depth maps from a single image, +// along with an estimated focal length. Input: 1536x1536 RGB image. +// Outputs: depth map (meters) + focal length (pixels). 
+//
+// Features:
+// - PhotosPicker for image selection
+// - Color-coded depth visualization (turbo colormap)
+// - Tap to measure distance at any point
+// - 3D parallax effect using CoreMotion
+// - Before/After depth overlay slider
+// - Focal length display
+// - Save depth map as image
+
+// MARK: - Turbo Colormap
+
+struct TurboColormap {
+    /// Maps a normalized value [0,1] to a turbo colormap RGB tuple.
+    /// Blue = far (0.0), Red = near (1.0).
+    static func color(for value: Float) -> (r: UInt8, g: UInt8, b: UInt8) {
+        let t = max(0, min(1, value))
+        let r = clampByte(34.61 + t * (1172.33 - t * (10793.56 - t * (33300.12 - t * (38394.49 - t * 14825.05)))))
+        let g = clampByte(23.31 + t * (557.33 + t * (1225.33 - t * (3574.96 - t * (1073.77 + t * 707.56)))))
+        let b = clampByte(27.2 + t * (3211.1 - t * (15327.97 - t * (27814.0 - t * (22569.18 - t * 6838.66)))))
+        return (r, g, b)
+    }
+
+    private static func clampByte(_ v: Float) -> UInt8 {
+        return UInt8(max(0, min(255, Int(v))))
+    }
+
+    /// Generates a UIImage depth visualization from a depth float buffer.
+    static func depthMapImage(from depthValues: [Float], width: Int, height: Int, minDepth: Float, maxDepth: Float) -> UIImage? {
+        let count = width * height
+        guard depthValues.count >= count else { return nil }
+
+        var pixelData = [UInt8](repeating: 255, count: count * 4)
+        let range = maxDepth - minDepth
+        let safeRange = range > 0 ? range : 1.0
+
+        for i in 0..<count {
+            // Normalize, then invert so near objects map to the red end
+            let normalized = (depthValues[i] - minDepth) / safeRange
+            let (r, g, b) = color(for: 1 - normalized)
+            pixelData[i * 4] = r
+            pixelData[i * 4 + 1] = g
+            pixelData[i * 4 + 2] = b
+        }
+
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        guard let context = CGContext(
+            data: &pixelData, width: width, height: height,
+            bitsPerComponent: 8, bytesPerRow: width * 4,
+            space: colorSpace,
+            bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
+        ), let cgImage = context.makeImage() else { return nil }
+
+        return UIImage(cgImage: cgImage)
+    }
+}
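+
+// SwiftUI views (the legend and overlays below) need the colormap as a `Color`
+// rather than raw bytes. A small convenience wrapper, added for illustration
+// (not part of the original file), over the tuple API above:
+
+extension TurboColormap {
+    /// SwiftUI color for a normalized value in [0, 1].
+    static func swiftUIColor(for value: Float) -> Color {
+        let (r, g, b) = color(for: value)
+        return Color(red: Double(r) / 255.0,
+                     green: Double(g) / 255.0,
+                     blue: Double(b) / 255.0)
+    }
+}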
"Saved to Photos" : "Save Depth Map") + } + .frame(maxWidth: .infinity) + .padding() + .background(viewModel.didSave ? Color.green : Color(.systemGray5)) + .foregroundColor(viewModel.didSave ? .white : .primary) + .cornerRadius(12) + } + .disabled(viewModel.didSave) + } + } + + // MARK: - Colormap Legend + + private var colormapLegendSection: some View { + VStack(spacing: 8) { + Text("Depth Colormap Legend") + .font(.caption) + .foregroundColor(.secondary) + .frame(maxWidth: .infinity, alignment: .leading) + + GeometryReader { geo in + let width = geo.size.width + HStack(spacing: 0) { + ForEach(0.. some View { + VStack(spacing: 4) { + Text(label) + .font(.caption2) + .foregroundColor(.secondary) + Text(String(format: "%.2f m", value)) + .font(.subheadline) + .fontWeight(.semibold) + .foregroundColor(color) + } + .frame(maxWidth: .infinity) + } +} + +// MARK: - Point Measurement Data + +struct PointMeasurement { + let depth: Float + let normalizedPoint: CGPoint + let viewPoint: CGPoint +} + +// MARK: - Depth Statistics + +struct DepthStats { + let min: Float + let max: Float + let mean: Float +} + +// MARK: - DepthPro ViewModel + +class DepthProViewModel: ObservableObject { + @Published var photoItem: PhotosPickerItem? { + didSet { loadImage() } + } + @Published var selectedImage: UIImage? + @Published var depthMapImage: UIImage? + @Published var depthValues: [Float] = [] + @Published var depthWidth: Int = 0 + @Published var depthHeight: Int = 0 + @Published var estimatedFocalLength: Float? + @Published var depthStats: DepthStats? + @Published var isProcessing = false + @Published var isProcessed = false + @Published var progress: Double = 0 + @Published var statusMessage = "" + @Published var errorMessage: String? + @Published var overlaySlider: Double = 0.5 + @Published var pointMeasurement: PointMeasurement? + @Published var didSave = false + + private func loadImage() { + guard let item = photoItem else { return } + reset() + Task { + do { + if let data = try await item.loadTransferable(type: Data.self), + let uiImage = UIImage(data: data) { + await MainActor.run { + self.selectedImage = uiImage + } + } + } catch { + await MainActor.run { + self.errorMessage = "Failed to load image: \(error.localizedDescription)" + } + } + } + } + + func reset() { + depthMapImage = nil + depthValues = [] + depthWidth = 0 + depthHeight = 0 + estimatedFocalLength = nil + depthStats = nil + isProcessed = false + isProcessing = false + progress = 0 + statusMessage = "" + errorMessage = nil + pointMeasurement = nil + didSave = false + } + + func estimateDepth() { + guard selectedImage != nil else { return } + isProcessing = true + errorMessage = nil + progress = 0 + + Task { + do { + try await performDepthEstimation() + await MainActor.run { + self.isProcessed = true + self.isProcessing = false + } + } catch { + await MainActor.run { + self.errorMessage = error.localizedDescription + self.isProcessing = false + } + } + } + } + + // MARK: - Core ML Inference + + private func performDepthEstimation() async throws { + await updateStatus("Loading Depth Pro model...", progress: 0.1) + + guard let modelURL = Bundle.main.url(forResource: "DepthPro", withExtension: "mlmodelc") else { + throw DepthProError.modelNotFound( + "DepthPro.mlmodelc not found in bundle. " + + "Please convert the model using convert_depth_pro.py, " + + "compile the .mlpackage, and add DepthPro.mlmodelc to the project." 
+            )
+        }
+
+        let config = MLModelConfiguration()
+        config.computeUnits = .cpuAndNeuralEngine
+        let model = try MLModel(contentsOf: modelURL, configuration: config)
+
+        await updateStatus("Preprocessing image...", progress: 0.3)
+
+        guard let inputImage = selectedImage else {
+            throw DepthProError.processingFailed("No image selected.")
+        }
+
+        // Resize to 1536x1536 for model input
+        let targetSize = CGSize(width: 1536, height: 1536)
+        guard let resizedImage = resizeImage(inputImage, to: targetSize),
+              let pixelBuffer = pixelBufferFromImage(resizedImage, size: targetSize) else {
+            throw DepthProError.processingFailed("Failed to preprocess image for model input.")
+        }
+
+        await updateStatus("Running depth estimation...", progress: 0.5)
+
+        let inputFeatures = try MLDictionaryFeatureProvider(dictionary: [
+            "image": MLFeatureValue(pixelBuffer: pixelBuffer)
+        ])
+
+        let result = try model.prediction(from: inputFeatures)
+
+        await updateStatus("Processing depth output...", progress: 0.8)
+
+        // Extract depth map
+        guard let depthMultiArray = result.featureValue(for: "depth")?.multiArrayValue else {
+            throw DepthProError.processingFailed("Model did not produce a depth output.")
+        }
+
+        // Extract focal length if available
+        var focalLength: Float? = nil
+        if let focalArray = result.featureValue(for: "focallength")?.multiArrayValue {
+            focalLength = focalArray[0].floatValue
+        }
+
+        // Parse depth map dimensions
+        let shape = depthMultiArray.shape.map { $0.intValue }
+        let dH: Int
+        let dW: Int
+        if shape.count == 4 {
+            dH = shape[2]
+            dW = shape[3]
+        } else if shape.count == 3 {
+            dH = shape[1]
+            dW = shape[2]
+        } else if shape.count == 2 {
+            dH = shape[0]
+            dW = shape[1]
+        } else {
+            dH = 1536
+            dW = 1536
+        }
+
+        // Copy depth values
+        let totalPixels = dH * dW
+        let pointer = depthMultiArray.dataPointer.bindMemory(to: Float.self, capacity: totalPixels)
+        var depths = [Float](repeating: 0, count: totalPixels)
+        for i in 0..<totalPixels {
+            depths[i] = pointer[i]
+        }
+
+        // Compute min / max / mean statistics
+        var minD = Float.greatestFiniteMagnitude
+        var maxD = -Float.greatestFiniteMagnitude
+        var sumD: Float = 0
+        for d in depths {
+            if d < minD { minD = d }
+            if d > maxD { maxD = d }
+            sumD += d
+        }
+        let meanD = sumD / Float(totalPixels)
+
+        // Generate colorized depth image
+        let depthImage = TurboColormap.depthMapImage(from: depths, width: dW, height: dH, minDepth: minD, maxDepth: maxD)
+
+        await updateStatus("Complete!", progress: 1.0)
+
+        await MainActor.run {
+            self.depthValues = depths
+            self.depthWidth = dW
+            self.depthHeight = dH
+            self.depthMapImage = depthImage
+            self.estimatedFocalLength = focalLength
+            self.depthStats = DepthStats(min: minD, max: maxD, mean: meanD)
+        }
+    }
+
+    // MARK: - Measure Depth at Point
+
+    func measureDepth(atNormalized point: CGPoint, viewLocation: CGPoint) {
+        guard !depthValues.isEmpty, depthWidth > 0, depthHeight > 0 else { return }
+
+        let px = Int(point.x * CGFloat(depthWidth))
+        let py = Int(point.y * CGFloat(depthHeight))
+        let clampedX = max(0, min(depthWidth - 1, px))
+        let clampedY = max(0, min(depthHeight - 1, py))
+        let index = clampedY * depthWidth + clampedX
+
+        guard index >= 0 && index < depthValues.count else { return }
+        let depth = depthValues[index]
+
+        pointMeasurement = PointMeasurement(
+            depth: depth,
+            normalizedPoint: point,
+            viewPoint: viewLocation
+        )
+    }
+
+    // MARK: - Save Depth Map
+
+    func saveDepthMap() {
+        guard let image = depthMapImage else { return }
+        UIImageWriteToSavedPhotosAlbum(image, nil, nil, nil)
+        didSave = true
+    }
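+
+    // MARK: - Unprojection (illustrative)
+
+    // Because the model returns metric depth plus a focal length in pixels,
+    // a measured pixel can be lifted to camera-space coordinates with the
+    // standard pinhole relation X = (x - cx) * Z / f, Y = (y - cy) * Z / f.
+    // This helper is an added sketch, not part of the original demo; it
+    // assumes the principal point sits at the image center.
+    func unprojectPoint(atNormalized point: CGPoint, depth: Float) -> SIMD3<Float>? {
+        guard let f = estimatedFocalLength, f > 0,
+              depthWidth > 0, depthHeight > 0 else { return nil }
+        let px = Float(point.x) * Float(depthWidth)
+        let py = Float(point.y) * Float(depthHeight)
+        let cx = Float(depthWidth) / 2
+        let cy = Float(depthHeight) / 2
+        return SIMD3<Float>((px - cx) * depth / f,
+                            (py - cy) * depth / f,
+                            depth)
+    }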
+
+    // MARK: - Image Utilities
+
+    private func resizeImage(_ image: UIImage, to size: CGSize) -> UIImage? {
+        UIGraphicsBeginImageContextWithOptions(size, true, 1.0)
+        image.draw(in: CGRect(origin: .zero, size: size))
+        let resized = UIGraphicsGetImageFromCurrentImageContext()
+        UIGraphicsEndImageContext()
+        return resized
+    }
+
+    private func pixelBufferFromImage(_ image: UIImage, size: CGSize) -> CVPixelBuffer? {
+        let width = Int(size.width)
+        let height = Int(size.height)
+        let attrs: [CFString: Any] = [
+            kCVPixelBufferCGImageCompatibilityKey: true,
+            kCVPixelBufferCGBitmapContextCompatibilityKey: true
+        ]
+        var pixelBuffer: CVPixelBuffer?
+        let status = CVPixelBufferCreate(
+            kCFAllocatorDefault, width, height,
+            kCVPixelFormatType_32BGRA,
+            attrs as CFDictionary,
+            &pixelBuffer
+        )
+        guard status == kCVReturnSuccess, let buffer = pixelBuffer else { return nil }
+
+        CVPixelBufferLockBaseAddress(buffer, [])
+        defer { CVPixelBufferUnlockBaseAddress(buffer, []) }
+
+        guard let context = CGContext(
+            data: CVPixelBufferGetBaseAddress(buffer),
+            width: width,
+            height: height,
+            bitsPerComponent: 8,
+            bytesPerRow: CVPixelBufferGetBytesPerRow(buffer),
+            space: CGColorSpaceCreateDeviceRGB(),
+            bitmapInfo: CGBitmapInfo.byteOrder32Little.rawValue | CGImageAlphaInfo.premultipliedFirst.rawValue
+        ) else { return nil }
+
+        guard let cgImage = image.cgImage else { return nil }
+        context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height))
+
+        return buffer
+    }
+
+    // MARK: - Status Updates
+
+    @MainActor
+    private func updateStatus(_ message: String, progress: Double) {
+        self.statusMessage = message
+        self.progress = progress
+    }
+}
+
+// MARK: - Errors
+
+enum DepthProError: LocalizedError {
+    case modelNotFound(String)
+    case processingFailed(String)
+
+    var errorDescription: String? {
+        switch self {
+        case .modelNotFound(let msg): return msg
+        case .processingFailed(let msg): return msg
+        }
+    }
+}
+
+// MARK: - Preview
+
+#Preview {
+    ContentView()
+}
diff --git a/creative_apps/DepthProDemo/DepthProDemo/DepthProDemoApp.swift b/creative_apps/DepthProDemo/DepthProDemo/DepthProDemoApp.swift
new file mode 100644
index 0000000..a306119
--- /dev/null
+++ b/creative_apps/DepthProDemo/DepthProDemo/DepthProDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct DepthProDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/creative_apps/DepthProDemo/DepthProDemo/Info.plist b/creative_apps/DepthProDemo/DepthProDemo/Info.plist
new file mode 100644
index 0000000..8e27fd4
--- /dev/null
+++ b/creative_apps/DepthProDemo/DepthProDemo/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>This app needs photo library access for selecting images for depth estimation.</string>
+</dict>
+</plist>
diff --git a/creative_apps/KokoroDemo/KokoroDemo.xcodeproj/project.pbxproj b/creative_apps/KokoroDemo/KokoroDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..e8656e1
--- /dev/null
+++ b/creative_apps/KokoroDemo/KokoroDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,270 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		KK0001 /* KokoroDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = KK0002; };
+		KK0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = KK0004; };
+		KK0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = KK0006; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		KK0007 /* KokoroDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = KokoroDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		KK0002 /* KokoroDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = KokoroDemoApp.swift; sourceTree = "<group>"; };
+		KK0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		KK0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		KK0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		KK0009 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		KK0010 = {
+			isa = PBXGroup;
+			children = (
+				KK0011 /* KokoroDemo */,
+				KK0012 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		KK0011 /* KokoroDemo */ = {
+			isa = PBXGroup;
+			children = (
+				KK0002 /* KokoroDemoApp.swift */,
+				KK0004 /* ContentView.swift */,
+				KK0006 /* Assets.xcassets */,
+				KK0008 /* Info.plist */,
+			);
+			path = KokoroDemo;
+			sourceTree = "<group>";
+		};
+		KK0012 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				KK0007 /* KokoroDemo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		KK0013 /* KokoroDemo */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = KK0014;
+			buildPhases = (
+				KK0015 /* Sources */,
+				KK0009 /* Frameworks */,
+				KK0016 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = KokoroDemo;
+			productName = KokoroDemo;
+			productReference = KK0007;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		KK0017 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				BuildIndependentTargetsInParallel = 1;
+				LastSwiftUpdateCheck = 1500;
+				LastUpgradeCheck = 1500;
+				TargetAttributes = {
+					KK0013 = {
+						CreatedOnToolsVersion = 15.0;
+					};
+				};
+			};
+			buildConfigurationList = KK0018;
+			compatibilityVersion = "Xcode 14.0";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = KK0010;
+			productRefGroup = KK0012;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				KK0013,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+		KK0016 /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				KK0005 /* Assets.xcassets in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		KK0015 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				KK0001 /* 
KokoroDemoApp.swift in Sources */, + KK0003 /* ContentView.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + KK0019 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + KK0020 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + KK0021 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = KokoroDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.kokorodemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + KK0022 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = KokoroDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + 
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.kokorodemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + KK0018 /* Build configuration list for PBXProject */ = { + isa = XCConfigurationList; + buildConfigurations = ( + KK0019, + KK0020, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + KK0014 /* Build configuration list for PBXNativeTarget */ = { + isa = XCConfigurationList; + buildConfigurations = ( + KK0021, + KK0022, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = KK0017; +} diff --git a/creative_apps/KokoroDemo/KokoroDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/KokoroDemo/KokoroDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/KokoroDemo/KokoroDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/KokoroDemo/KokoroDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/KokoroDemo/KokoroDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/KokoroDemo/KokoroDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/KokoroDemo/KokoroDemo/Assets.xcassets/Contents.json b/creative_apps/KokoroDemo/KokoroDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/creative_apps/KokoroDemo/KokoroDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/KokoroDemo/KokoroDemo/ContentView.swift b/creative_apps/KokoroDemo/KokoroDemo/ContentView.swift new file mode 100644 index 0000000..e539155 --- /dev/null +++ b/creative_apps/KokoroDemo/KokoroDemo/ContentView.swift @@ -0,0 +1,958 @@ +import SwiftUI +import CoreML +import AVFoundation + +// MARK: - Kokoro-82M Text-to-Speech Demo +// +// Kokoro-82M is a lightweight TTS model based on StyleTTS2 architecture with +// an ISTFTNet decoder. It supports multiple voices across US English, UK English, +// and Japanese. The model takes phoneme tokens and a voice style embedding as +// input and produces a raw audio waveform at 24kHz. +// +// Pre-converted CoreML model: https://huggingface.co/FluidInference/kokoro-82m-coreml +// iOS Swift package: https://github.com/mlalma/kokoro-ios +// +// This demo provides the full UI flow. 
A production implementation would use the +// kokoro-ios Swift package for the phonemizer and full inference pipeline. + +// MARK: - Voice Data Model + +enum VoiceCategory: String, CaseIterable, Identifiable { + case usEnglishFemale = "US English (Female)" + case usEnglishMale = "US English (Male)" + case ukEnglishFemale = "UK English (Female)" + case ukEnglishMale = "UK English (Male)" + case japanese = "Japanese" + + var id: String { rawValue } +} + +struct KokoroVoice: Identifiable, Hashable { + let id: String + let displayName: String + let category: VoiceCategory + let languageCode: String + + var flag: String { + switch category { + case .usEnglishFemale, .usEnglishMale: return "🇺🇸" + case .ukEnglishFemale, .ukEnglishMale: return "🇬🇧" + case .japanese: return "🇯🇵" + } + } +} + +let availableVoices: [KokoroVoice] = [ + // US English Female + KokoroVoice(id: "af_heart", displayName: "Heart", category: .usEnglishFemale, languageCode: "en-us"), + KokoroVoice(id: "af_bella", displayName: "Bella", category: .usEnglishFemale, languageCode: "en-us"), + KokoroVoice(id: "af_nicole", displayName: "Nicole", category: .usEnglishFemale, languageCode: "en-us"), + KokoroVoice(id: "af_aoede", displayName: "Aoede", category: .usEnglishFemale, languageCode: "en-us"), + KokoroVoice(id: "af_kore", displayName: "Kore", category: .usEnglishFemale, languageCode: "en-us"), + KokoroVoice(id: "af_sarah", displayName: "Sarah", category: .usEnglishFemale, languageCode: "en-us"), + KokoroVoice(id: "af_sky", displayName: "Sky", category: .usEnglishFemale, languageCode: "en-us"), + // US English Male + KokoroVoice(id: "am_adam", displayName: "Adam", category: .usEnglishMale, languageCode: "en-us"), + KokoroVoice(id: "am_michael", displayName: "Michael", category: .usEnglishMale, languageCode: "en-us"), + KokoroVoice(id: "am_echo", displayName: "Echo", category: .usEnglishMale, languageCode: "en-us"), + KokoroVoice(id: "am_liam", displayName: "Liam", category: .usEnglishMale, languageCode: "en-us"), + // UK English Female + KokoroVoice(id: "bf_emma", displayName: "Emma", category: .ukEnglishFemale, languageCode: "en-gb"), + KokoroVoice(id: "bf_isabella", displayName: "Isabella", category: .ukEnglishFemale, languageCode: "en-gb"), + // UK English Male + KokoroVoice(id: "bm_george", displayName: "George", category: .ukEnglishMale, languageCode: "en-gb"), + KokoroVoice(id: "bm_lewis", displayName: "Lewis", category: .ukEnglishMale, languageCode: "en-gb"), + // Japanese + KokoroVoice(id: "jf_alpha", displayName: "Alpha", category: .japanese, languageCode: "ja"), + KokoroVoice(id: "jf_gongitsune", displayName: "Gongitsune", category: .japanese, languageCode: "ja"), + KokoroVoice(id: "jm_kumo", displayName: "Kumo", category: .japanese, languageCode: "ja"), +] + +// MARK: - Playback State + +enum PlaybackState: Equatable { + case idle + case playing + case paused +} + +// MARK: - Content View + +struct ContentView: View { + @StateObject private var viewModel = KokoroViewModel() + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 20) { + // Text input section + textInputSection + + // Voice selection section + voiceSelectionSection + + // Speed control section + speedControlSection + + // Generate button + generateButton + + // Progress indicator + if viewModel.isGenerating { + progressSection + } + + // Error display + if let error = viewModel.errorMessage { + errorSection(error) + } + + // Playback controls + if viewModel.hasGeneratedAudio { + waveformSection + playbackControlsSection + 
saveButton + } + } + .padding() + } + .navigationTitle("Kokoro TTS") + .toolbar { + ToolbarItem(placement: .navigationBarTrailing) { + Menu { + Section("About") { + Label("Kokoro-82M", systemImage: "info.circle") + Label("StyleTTS2 Architecture", systemImage: "cpu") + Label("24kHz Output", systemImage: "waveform") + } + } label: { + Image(systemName: "ellipsis.circle") + } + } + } + } + } + + // MARK: - Text Input + + private var textInputSection: some View { + VStack(alignment: .leading, spacing: 8) { + Label("Text to Speak", systemImage: "text.alignleft") + .font(.headline) + + TextEditor(text: $viewModel.inputText) + .frame(minHeight: 120, maxHeight: 200) + .padding(8) + .background(Color(.systemGray6)) + .cornerRadius(12) + .overlay( + RoundedRectangle(cornerRadius: 12) + .stroke(Color(.systemGray4), lineWidth: 1) + ) + .overlay(alignment: .topLeading) { + if viewModel.inputText.isEmpty { + Text("Enter text to synthesize speech...") + .foregroundColor(.secondary) + .padding(.horizontal, 12) + .padding(.vertical, 16) + .allowsHitTesting(false) + } + } + + HStack { + Text("\(viewModel.inputText.count) characters") + .font(.caption) + .foregroundColor(.secondary) + Spacer() + Button("Clear") { + viewModel.inputText = "" + } + .font(.caption) + .disabled(viewModel.inputText.isEmpty) + } + } + } + + // MARK: - Voice Selection + + private var voiceSelectionSection: some View { + VStack(alignment: .leading, spacing: 8) { + Label("Voice", systemImage: "person.wave.2") + .font(.headline) + + // Category picker + Picker("Category", selection: $viewModel.selectedCategory) { + ForEach(VoiceCategory.allCases) { category in + Text(category.rawValue).tag(category) + } + } + .pickerStyle(.menu) + + // Voice list for selected category + let filteredVoices = availableVoices.filter { + $0.category == viewModel.selectedCategory + } + + ScrollView(.horizontal, showsIndicators: false) { + HStack(spacing: 10) { + ForEach(filteredVoices) { voice in + VoiceChipView( + voice: voice, + isSelected: viewModel.selectedVoice.id == voice.id, + onTap: { viewModel.selectedVoice = voice } + ) + } + } + } + + HStack(spacing: 6) { + Text(viewModel.selectedVoice.flag) + Text(viewModel.selectedVoice.displayName) + .fontWeight(.medium) + Text("(\(viewModel.selectedVoice.id))") + .foregroundColor(.secondary) + } + .font(.subheadline) + } + } + + // MARK: - Speed Control + + private var speedControlSection: some View { + VStack(alignment: .leading, spacing: 8) { + HStack { + Label("Speed", systemImage: "gauge.with.dots.needle.67percent") + .font(.headline) + Spacer() + Text(String(format: "%.1fx", viewModel.speed)) + .font(.subheadline) + .fontWeight(.semibold) + .foregroundColor(.accentColor) + .monospacedDigit() + } + + HStack(spacing: 12) { + Text("0.5x") + .font(.caption2) + .foregroundColor(.secondary) + Slider(value: $viewModel.speed, in: 0.5...2.0, step: 0.1) + .tint(.accentColor) + Text("2.0x") + .font(.caption2) + .foregroundColor(.secondary) + } + + HStack(spacing: 8) { + ForEach([0.5, 0.75, 1.0, 1.25, 1.5, 2.0], id: \.self) { preset in + Button { + viewModel.speed = preset + } label: { + Text(String(format: "%.1fx", preset)) + .font(.caption2) + .fontWeight(viewModel.speed == preset ? .bold : .regular) + .padding(.horizontal, 8) + .padding(.vertical, 4) + .background( + viewModel.speed == preset + ? Color.accentColor.opacity(0.2) + : Color(.systemGray5) + ) + .foregroundColor( + viewModel.speed == preset + ? 
.accentColor + : .primary + ) + .cornerRadius(6) + } + } + Spacer() + } + } + } + + // MARK: - Generate Button + + private var generateButton: some View { + Button(action: { viewModel.generateSpeech() }) { + HStack(spacing: 10) { + if viewModel.isGenerating { + ProgressView() + .tint(.white) + } else { + Image(systemName: "waveform.and.mic") + } + Text(viewModel.isGenerating ? "Generating..." : "Speak") + .fontWeight(.semibold) + } + .frame(maxWidth: .infinity) + .padding() + .background( + viewModel.canGenerate && !viewModel.isGenerating + ? Color.accentColor + : Color.gray + ) + .foregroundColor(.white) + .cornerRadius(14) + } + .disabled(!viewModel.canGenerate || viewModel.isGenerating) + } + + // MARK: - Progress + + private var progressSection: some View { + VStack(spacing: 8) { + ProgressView(value: viewModel.progress) + .progressViewStyle(.linear) + .tint(.accentColor) + Text(viewModel.statusMessage) + .font(.caption) + .foregroundColor(.secondary) + } + } + + // MARK: - Error + + private func errorSection(_ message: String) -> some View { + HStack(spacing: 8) { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.red) + Text(message) + .font(.caption) + .foregroundColor(.red) + } + .padding() + .frame(maxWidth: .infinity, alignment: .leading) + .background(Color.red.opacity(0.1)) + .cornerRadius(10) + } + + // MARK: - Waveform Visualization + + private var waveformSection: some View { + VStack(alignment: .leading, spacing: 8) { + Label("Waveform", systemImage: "waveform") + .font(.headline) + + WaveformVisualization( + samples: viewModel.waveformSamples, + playbackProgress: viewModel.playbackProgress, + isPlaying: viewModel.playbackState == .playing + ) + .frame(height: 100) + .background(Color(.systemGray6)) + .cornerRadius(12) + + if let duration = viewModel.audioDuration { + HStack { + Text(viewModel.formattedCurrentTime) + .font(.caption) + .monospacedDigit() + .foregroundColor(.secondary) + Spacer() + Text(formatDuration(duration)) + .font(.caption) + .monospacedDigit() + .foregroundColor(.secondary) + } + } + } + } + + // MARK: - Playback Controls + + private var playbackControlsSection: some View { + HStack(spacing: 30) { + Spacer() + + // Stop + Button(action: { viewModel.stopPlayback() }) { + Image(systemName: "stop.fill") + .font(.title2) + .foregroundColor( + viewModel.playbackState != .idle ? .primary : .gray + ) + } + .disabled(viewModel.playbackState == .idle) + + // Play / Pause + Button(action: { + if viewModel.playbackState == .playing { + viewModel.pausePlayback() + } else { + viewModel.playAudio() + } + }) { + Image(systemName: viewModel.playbackState == .playing + ? "pause.circle.fill" + : "play.circle.fill") + .font(.system(size: 52)) + .foregroundColor(.accentColor) + } + + // Stop + Button(action: { viewModel.stopPlayback() }) { + Image(systemName: "stop.circle.fill") + .font(.title2) + .foregroundColor( + viewModel.playbackState != .idle ? 
.red : .gray
+                    )
+            }
+            .disabled(viewModel.playbackState == .idle)
+
+            Spacer()
+        }
+        .padding(.vertical, 8)
+    }
+
+    // MARK: - Save Button
+
+    private var saveButton: some View {
+        Button(action: { viewModel.saveAudioToFiles() }) {
+            HStack {
+                Image(systemName: "square.and.arrow.down")
+                Text("Save Audio")
+            }
+            .frame(maxWidth: .infinity)
+            .padding()
+            .background(Color(.systemGray6))
+            .foregroundColor(.accentColor)
+            .cornerRadius(12)
+            .overlay(
+                RoundedRectangle(cornerRadius: 12)
+                    .stroke(Color.accentColor.opacity(0.3), lineWidth: 1)
+            )
+        }
+    }
+
+    private func formatDuration(_ duration: TimeInterval) -> String {
+        let minutes = Int(duration) / 60
+        let seconds = Int(duration) % 60
+        let millis = Int((duration.truncatingRemainder(dividingBy: 1)) * 100)
+        return String(format: "%d:%02d.%02d", minutes, seconds, millis)
+    }
+}
+
+// MARK: - Voice Chip View
+
+struct VoiceChipView: View {
+    let voice: KokoroVoice
+    let isSelected: Bool
+    let onTap: () -> Void
+
+    var body: some View {
+        Button(action: onTap) {
+            HStack(spacing: 6) {
+                Text(voice.flag)
+                    .font(.caption)
+                Text(voice.displayName)
+                    .font(.subheadline)
+                    .fontWeight(isSelected ? .semibold : .regular)
+            }
+            .padding(.horizontal, 14)
+            .padding(.vertical, 8)
+            .background(
+                isSelected
+                    ? Color.accentColor.opacity(0.15)
+                    : Color(.systemGray6)
+            )
+            .foregroundColor(isSelected ? .accentColor : .primary)
+            .cornerRadius(20)
+            .overlay(
+                RoundedRectangle(cornerRadius: 20)
+                    .stroke(
+                        isSelected ? Color.accentColor : Color.clear,
+                        lineWidth: 1.5
+                    )
+            )
+        }
+    }
+}
+
+// MARK: - Waveform Visualization
+
+struct WaveformVisualization: View {
+    let samples: [Float]
+    let playbackProgress: Double
+    let isPlaying: Bool
+
+    var body: some View {
+        GeometryReader { geo in
+            let barCount = Int(geo.size.width / 3)
+            let midY = geo.size.height / 2
+
+            Canvas { context, size in
+                guard !samples.isEmpty else {
+                    // Draw flat line when no samples
+                    var path = Path()
+                    path.move(to: CGPoint(x: 0, y: midY))
+                    path.addLine(to: CGPoint(x: size.width, y: midY))
+                    context.stroke(path, with: .color(.gray.opacity(0.3)), lineWidth: 1)
+                    return
+                }
+
+                let step = max(1, samples.count / barCount)
+                let progressX = size.width * playbackProgress
+
+                for i in 0..<barCount {
+                    // One vertical bar per downsampled amplitude value
+                    let sampleIndex = min(samples.count - 1, i * step)
+                    let amplitude = CGFloat(min(1, abs(samples[sampleIndex])))
+                    let barHeight = max(2, amplitude * size.height * 0.9)
+                    let x = CGFloat(i) * 3
+
+                    var bar = Path()
+                    bar.move(to: CGPoint(x: x, y: midY - barHeight / 2))
+                    bar.addLine(to: CGPoint(x: x, y: midY + barHeight / 2))
+
+                    let isPlayed = x <= progressX
+                    context.stroke(
+                        bar,
+                        with: .color(isPlayed ? .accentColor : .gray.opacity(0.5)),
+                        lineWidth: 2
+                    )
+                }
+
+                // Playhead
+                if playbackProgress > 0 && playbackProgress < 1 {
+                    var playhead = Path()
+                    playhead.move(to: CGPoint(x: progressX, y: 0))
+                    playhead.addLine(to: CGPoint(x: progressX, y: size.height))
+                    context.stroke(
+                        playhead,
+                        with: .color(.accentColor),
+                        lineWidth: 1.5
+                    )
+                }
+            }
+        }
+        .padding(8)
+    }
+}
+
+// MARK: - Simplified Phoneme Tokenizer
+//
+// Kokoro uses phoneme-based input tokens. In production, use a full G2P
+// (grapheme-to-phoneme) library or espeak-ng for accurate conversion.
+// This simplified tokenizer maps basic English text to approximate phoneme tokens.
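+//
+// As a worked example of the mapping defined below (a = 1 ... z = 26,
+// punctuation from 27), SimplePhonemeTokenizer.tokenize("Hi!") lowercases and
+// filters the input, then yields [8, 9, 30]: h = 8, i = 9, "!" = 30.
+// Characters outside the mapped set are simply dropped.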
+ +struct SimplePhonemeTokenizer { + // Simplified phoneme vocabulary mapping (subset of IPA) + // In production, use espeak-ng or the kokoro-ios package phonemizer + private static let charToPhoneme: [Character: [Int]] = { + var map: [Character: [Int]] = [:] + let alphabet = "abcdefghijklmnopqrstuvwxyz" + // Simple one-to-one mapping for demo purposes + // Real Kokoro uses IPA phonemes from espeak-ng + for (index, char) in alphabet.enumerated() { + map[char] = [index + 1] // Token IDs start at 1, 0 = padding + } + map[" "] = [27] // space token + map["."] = [28] // period / sentence boundary + map[","] = [29] // comma / pause + map["!"] = [30] + map["?"] = [31] + return map + }() + + /// Convert text to simplified phoneme token IDs + /// In production, this would use espeak-ng for proper G2P conversion + static func tokenize(_ text: String) -> [Int] { + let cleaned = text.lowercased() + .filter { $0.isLetter || $0.isWhitespace || ".!?,".contains($0) } + + var tokens: [Int] = [] + for char in cleaned { + if let phonemeIDs = charToPhoneme[char] { + tokens.append(contentsOf: phonemeIDs) + } + } + + // Kokoro model expects a maximum sequence length + // Truncate to 510 tokens (with start/end tokens = 512) + if tokens.count > 510 { + tokens = Array(tokens.prefix(510)) + } + + return tokens + } +} + +// MARK: - ViewModel + +class KokoroViewModel: ObservableObject { + @Published var inputText: String = "Hello! This is a demonstration of the Kokoro text to speech model running on device with CoreML." + @Published var selectedCategory: VoiceCategory = .usEnglishFemale + @Published var selectedVoice: KokoroVoice = availableVoices[0] + @Published var speed: Double = 1.0 + @Published var isGenerating = false + @Published var progress: Double = 0 + @Published var statusMessage = "" + @Published var errorMessage: String? + @Published var hasGeneratedAudio = false + @Published var playbackState: PlaybackState = .idle + @Published var playbackProgress: Double = 0 + @Published var audioDuration: TimeInterval? + @Published var waveformSamples: [Float] = [] + @Published var showShareSheet = false + + private var audioEngine: AVAudioEngine? + private var playerNode: AVAudioPlayerNode? + private var audioBuffer: AVAudioPCMBuffer? + private var displayLink: CADisplayLink? + private var playbackStartTime: TimeInterval = 0 + private var pausedTime: TimeInterval = 0 + private var generatedAudioURL: URL? + + var canGenerate: Bool { + !inputText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + } + + var formattedCurrentTime: String { + guard let duration = audioDuration else { return "0:00.00" } + let current = duration * playbackProgress + let minutes = Int(current) / 60 + let seconds = Int(current) % 60 + let millis = Int((current.truncatingRemainder(dividingBy: 1)) * 100) + return String(format: "%d:%02d.%02d", minutes, seconds, millis) + } + + // MARK: - Speech Generation + + func generateSpeech() { + guard canGenerate else { return } + + stopPlayback() + isGenerating = true + errorMessage = nil + hasGeneratedAudio = false + progress = 0 + waveformSamples = [] + + Task { + do { + try await performGeneration() + await MainActor.run { + self.hasGeneratedAudio = true + self.isGenerating = false + } + } catch { + await MainActor.run { + self.errorMessage = error.localizedDescription + self.isGenerating = false + } + } + } + } + + /// Perform TTS generation using the Kokoro CoreML model + /// + /// Full pipeline overview: + /// 1. Tokenize input text to phoneme IDs using G2P (grapheme-to-phoneme) + /// 2. 
Load voice style embedding vector for the selected voice
+    /// 3. Run duration predictor to determine phoneme timings
+    /// 4. Run decoder (ISTFTNet) to synthesize the audio waveform at 24kHz
+    /// 5. Apply speed factor by adjusting duration predictions
+    ///
+    /// This demo loads the model and prepares inputs; a production app
+    /// should use the kokoro-ios Swift package for the full pipeline.
+    private func performGeneration() async throws {
+        await updateStatus("Loading model...", progress: 0.1)
+
+        guard let modelURL = Bundle.main.url(forResource: "Kokoro82M", withExtension: "mlmodelc") else {
+            throw KokoroError.modelNotFound(
+                "Kokoro82M.mlmodelc not found in bundle. " +
+                "Download the CoreML model from huggingface.co/FluidInference/kokoro-82m-coreml " +
+                "and add it to the Xcode project."
+            )
+        }
+
+        let config = MLModelConfiguration()
+        config.computeUnits = .cpuAndNeuralEngine
+        let model = try MLModel(contentsOf: modelURL, configuration: config)
+
+        await updateStatus("Tokenizing text...", progress: 0.25)
+
+        // Tokenize input text to phoneme IDs
+        let tokens = SimplePhonemeTokenizer.tokenize(inputText)
+
+        guard !tokens.isEmpty else {
+            throw KokoroError.processingFailed("No valid tokens produced from input text.")
+        }
+
+        await updateStatus("Preparing inputs...", progress: 0.4)
+
+        // Prepare model inputs
+        // Token sequence: padded to model's expected length
+        let maxTokens = 512
+        let tokenArray = try MLMultiArray(shape: [1, NSNumber(value: maxTokens)], dataType: .int32)
+        for i in 0..<maxTokens {
+            tokenArray[[0, NSNumber(value: i)]] = NSNumber(value: i < tokens.count ? tokens[i] : 0)
+        }
+        _ = model // Loaded and ready; the full pass also needs the voice style embedding.
+
+        await updateStatus("Synthesizing audio...", progress: 0.6)
+
+        // Placeholder synthesis so the playback UI can be exercised without
+        // the full duration-predictor / ISTFTNet decoder pipeline.
+        let samples = synthesizePlaceholderAudio(text: inputText, speed: speed)
+
+        await updateStatus("Preparing playback...", progress: 0.85)
+
+        let sampleRate: Double = 24000
+        guard let format = AVAudioFormat(standardFormatWithSampleRate: sampleRate, channels: 1),
+              let buffer = AVAudioPCMBuffer(pcmFormat: format,
+                                            frameCapacity: AVAudioFrameCount(samples.count)) else {
+            throw KokoroError.processingFailed("Failed to create audio buffer.")
+        }
+        buffer.frameLength = AVAudioFrameCount(samples.count)
+        if let channel = buffer.floatChannelData?[0] {
+            for i in 0..<samples.count { channel[i] = samples[i] }
+        }
+
+        // Keep a WAV copy on disk for the save button
+        let tempURL = FileManager.default.temporaryDirectory
+            .appendingPathComponent("kokoro_output.wav")
+        try? FileManager.default.removeItem(at: tempURL)
+        let audioFile = try AVAudioFile(forWriting: tempURL, settings: format.settings)
+        try audioFile.write(from: buffer)
+
+        let duration = Double(samples.count) / sampleRate
+        let preview = downsampleWaveform(samples, targetCount: 200)
+
+        await MainActor.run {
+            self.audioBuffer = buffer
+            self.generatedAudioURL = tempURL
+            self.audioDuration = duration
+            self.waveformSamples = preview
+        }
+
+        await updateStatus("Complete!", progress: 1.0)
+    }
+
+    @MainActor
+    private func updateStatus(_ message: String, progress: Double) {
+        self.statusMessage = message
+        self.progress = progress
+    }
+
+    /// Placeholder audio used in place of the full Kokoro decoder output.
+    private func synthesizePlaceholderAudio(text: String, speed: Double) -> [Float] {
+        let sampleRate: Double = 24000
+        // Approximate duration: ~80ms per character at 1x speed
+        let duration = Double(text.count) * 0.08 / speed
+        let sampleCount = Int(sampleRate * duration)
+        var samples = [Float](repeating: 0, count: sampleCount)
+
+        for i in 0..<sampleCount {
+            // Amplitude-modulated tone standing in for synthesized speech
+            let t = Double(i) / sampleRate
+            let envelope = 0.5 * (1 + sin(2 * .pi * 2.5 * t))
+            samples[i] = Float(0.3 * envelope * sin(2 * .pi * 220 * t))
+        }
+
+        return samples
+    }
+
+    /// Downsample the waveform to a small number of bars for visualization.
+    private func downsampleWaveform(_ samples: [Float], targetCount: Int) -> [Float] {
+        guard samples.count > targetCount else { return samples }
+        let chunkSize = samples.count / targetCount
+        var result = [Float]()
+        result.reserveCapacity(targetCount)
+        for i in 0..<targetCount {
+            // Mean absolute amplitude per chunk
+            let start = i * chunkSize
+            let end = min(samples.count, start + chunkSize)
+            var sum: Float = 0
+            for j in start..<end { sum += abs(samples[j]) }
+            result.append(sum / Float(max(1, end - start)))
+        }
+        return result
+    }
+
+    // MARK: - Playback
+
+    func playAudio() {
+        guard let buffer = audioBuffer else { return }
+
+        // Resume if paused
+        if playbackState == .paused, let player = playerNode {
+            player.play()
+            playbackStartTime = CACurrentMediaTime()
+            playbackState = .playing
+            startDisplayLink()
+            return
+        }
+
+        stopPlayback()
+
+        let engine = AVAudioEngine()
+        let player = AVAudioPlayerNode()
+        engine.attach(player)
+        engine.connect(player, to: engine.mainMixerNode, format: buffer.format)
+
+        do {
+            try engine.start()
+        } catch {
+            errorMessage = "Audio engine failed to start: \(error.localizedDescription)"
+            return
+        }
+
+        player.scheduleBuffer(buffer, at: nil, options: []) { [weak self] in
+            DispatchQueue.main.async { self?.stopPlayback() }
+        }
+        player.play()
+
+        audioEngine = engine
+        playerNode = player
+        pausedTime = 0
+        playbackStartTime = CACurrentMediaTime()
+        playbackState = .playing
+        startDisplayLink()
+    }
+
+    func pausePlayback() {
+        guard playbackState == .playing else { return }
+        playerNode?.pause()
+        pausedTime += CACurrentMediaTime() - playbackStartTime
+        playbackState = .paused
+        stopDisplayLink()
+    }
+
+    func stopPlayback() {
+        playerNode?.stop()
+        audioEngine?.stop()
+        playerNode = nil
+        audioEngine = nil
+        playbackState = .idle
+        playbackProgress = 0
+        pausedTime = 0
+        stopDisplayLink()
+    }
+
+    private func startDisplayLink() {
+        stopDisplayLink()
+        let link = CADisplayLink(target: self, selector: #selector(updatePlaybackProgress))
+        link.add(to: .main, forMode: .common)
+        displayLink = link
+    }
+
+    private func stopDisplayLink() {
+        displayLink?.invalidate()
+        displayLink = nil
+    }
+
+    @objc private func updatePlaybackProgress() {
+        guard let duration = audioDuration, duration > 0, playbackState == .playing else { return }
+        let elapsed = pausedTime + (CACurrentMediaTime() - playbackStartTime)
+        playbackProgress = min(elapsed / duration, 1.0)
+        if playbackProgress >= 1.0 {
+            stopPlayback()
+        }
+    }
+
+    // MARK: - Save Audio
+
+    func saveAudioToFiles() {
+        guard let sourceURL = generatedAudioURL else {
+            errorMessage = "No audio to save."
+            return
+        }
+
+        let documentsURL = FileManager.default.urls(
+            for: .documentDirectory, in: .userDomainMask
+        ).first!
+        let voiceName = selectedVoice.id
+        let timestamp = Int(Date().timeIntervalSince1970)
+        let fileName = "kokoro_\(voiceName)_\(timestamp).wav"
+        let destURL = documentsURL.appendingPathComponent(fileName)
+
+        do {
+            if FileManager.default.fileExists(atPath: destURL.path) {
+                try FileManager.default.removeItem(at: destURL)
+            }
+            try FileManager.default.copyItem(at: sourceURL, to: destURL)
+            statusMessage = "Saved: \(fileName)"
+        } catch {
+            errorMessage = "Save failed: \(error.localizedDescription)"
+        }
+    }
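+
+    // MARK: - Audio Session (illustrative)
+
+    // Playback through AVAudioEngine is affected by the app's AVAudioSession
+    // configuration. A minimal sketch, added for illustration; the category
+    // and mode choices here are assumptions, not part of the original demo.
+    private func configureAudioSession() {
+        do {
+            let session = AVAudioSession.sharedInstance()
+            try session.setCategory(.playback, mode: .spokenAudio)
+            try session.setActive(true)
+        } catch {
+            errorMessage = "Audio session setup failed: \(error.localizedDescription)"
+        }
+    }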
diff --git a/creative_apps/KokoroDemo/KokoroDemo/Info.plist b/creative_apps/KokoroDemo/KokoroDemo/Info.plist
new file mode 100644
index 0000000..0c67376
--- /dev/null
+++ b/creative_apps/KokoroDemo/KokoroDemo/Info.plist
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict/>
+</plist>
diff --git a/creative_apps/KokoroDemo/KokoroDemo/KokoroDemoApp.swift b/creative_apps/KokoroDemo/KokoroDemo/KokoroDemoApp.swift
new file mode 100644
index 0000000..942d713
--- /dev/null
+++ b/creative_apps/KokoroDemo/KokoroDemo/KokoroDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct KokoroDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/creative_apps/PPOCRv5Demo/PPOCRv5Demo.xcodeproj/project.pbxproj b/creative_apps/PPOCRv5Demo/PPOCRv5Demo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..5af0cca
--- /dev/null
+++ b/creative_apps/PPOCRv5Demo/PPOCRv5Demo.xcodeproj/project.pbxproj
@@ -0,0 +1,270 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		PO0001 /* PPOCRv5DemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = PO0002; };
+		PO0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = PO0004; };
+		PO0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = PO0006; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		PO0007 /* PPOCRv5Demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = PPOCRv5Demo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		PO0002 /* PPOCRv5DemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PPOCRv5DemoApp.swift; sourceTree = "<group>"; };
+		PO0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		PO0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		PO0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		PO0009 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		PO0010 = {
+			isa = PBXGroup;
+			children = (
+				PO0011 /* PPOCRv5Demo */,
+				PO0012 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		PO0011 /* PPOCRv5Demo */ = {
+			isa = PBXGroup;
+			children = (
+				PO0002 /* PPOCRv5DemoApp.swift */,
+				PO0004 /* ContentView.swift */,
+				PO0006 /* Assets.xcassets */,
+				PO0008 /* Info.plist */,
+			);
+			path = PPOCRv5Demo;
+			sourceTree = "<group>";
+		};
+		PO0012 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				PO0007 /* PPOCRv5Demo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		PO0013 /* PPOCRv5Demo */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = PO0014;
+			buildPhases = (
+				PO0015 /* Sources */,
+				PO0009 /* Frameworks */,
+				PO0016 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = PPOCRv5Demo;
+			productName = PPOCRv5Demo;
+			productReference = PO0007;
+			productType = "com.apple.product-type.application";
+ }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + PO0017 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + PO0013 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = PO0018; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = PO0010; + productRefGroup = PO0012; + projectDirPath = ""; + projectRoot = ""; + targets = ( + PO0013, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + PO0016 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + PO0005 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + PO0015 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + PO0001 /* PPOCRv5DemoApp.swift in Sources */, + PO0003 /* ContentView.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + PO0019 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + PO0020 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + PO0021 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = PPOCRv5Demo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait 
UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.ppocrv5demo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + PO0022 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = PPOCRv5Demo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.ppocrv5demo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + PO0018 /* Build configuration list for PBXProject */ = { + isa = XCConfigurationList; + buildConfigurations = ( + PO0019, + PO0020, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + PO0014 /* Build configuration list for PBXNativeTarget */ = { + isa = XCConfigurationList; + buildConfigurations = ( + PO0021, + PO0022, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = PO0017; +} diff --git a/creative_apps/PPOCRv5Demo/PPOCRv5Demo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/PPOCRv5Demo/PPOCRv5Demo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/PPOCRv5Demo/PPOCRv5Demo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/PPOCRv5Demo/PPOCRv5Demo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/PPOCRv5Demo/PPOCRv5Demo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/PPOCRv5Demo/PPOCRv5Demo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/PPOCRv5Demo/PPOCRv5Demo/Assets.xcassets/Contents.json b/creative_apps/PPOCRv5Demo/PPOCRv5Demo/Assets.xcassets/Contents.json new file 
mode 100644
index 0000000..73c0059
--- /dev/null
+++ b/creative_apps/PPOCRv5Demo/PPOCRv5Demo/Assets.xcassets/Contents.json
@@ -0,0 +1,6 @@
+{
+  "info" : {
+    "author" : "xcode",
+    "version" : 1
+  }
+}
diff --git a/creative_apps/PPOCRv5Demo/PPOCRv5Demo/ContentView.swift b/creative_apps/PPOCRv5Demo/PPOCRv5Demo/ContentView.swift
new file mode 100644
index 0000000..604ab55
--- /dev/null
+++ b/creative_apps/PPOCRv5Demo/PPOCRv5Demo/ContentView.swift
@@ -0,0 +1,1036 @@
+import SwiftUI
+import UIKit
+import CoreML
+import PhotosUI
+import Accelerate
+
+// MARK: - Data Types
+
+/// Represents a single detected text region with its bounding box and recognized text
+struct TextRegion: Identifiable {
+    let id = UUID()
+    let boundingBox: CGRect // Normalized coordinates (0...1)
+    let text: String
+    let confidence: Float
+    let color: Color
+}
+
+/// Processing state for the two-stage OCR pipeline
+enum OCRProcessingStep: String {
+    case idle = "Ready"
+    case detecting = "Detecting text regions..."
+    case recognizing = "Recognizing text..."
+    case done = "Complete"
+}
+
+// MARK: - PP-OCRv5 Processor
+
+/// Two-stage OCR pipeline: text detection followed by text recognition
+class PPOCRProcessor: ObservableObject {
+    @Published var inputImage: UIImage?
+    @Published var textRegions: [TextRegion] = []
+    @Published var fullText: String = ""
+    @Published var isProcessing = false
+    @Published var processingStep: OCRProcessingStep = .idle
+    @Published var errorMessage: String?
+    @Published var detectionTime: Double = 0
+    @Published var recognitionTime: Double = 0
+    @Published var detectedLanguage: String = "Unknown"
+
+    private var detModel: MLModel?
+    private var recModel: MLModel?
+
+    private let detInputSize = 640
+    private let recHeight = 48
+    private let recWidth = 320
+    private let detThreshold: Float = 0.3
+    private let boxThreshold: Float = 0.5
+    private let minBoxSize: Float = 3.0
+
+    /// Character set for CTC decoding (simplified multilingual set)
+    private let vocabulary: [Character] = {
+        var chars: [Character] = [" "] // Index 0 = blank for CTC
+        // ASCII printable characters
+        for i in 32...126 {
+            chars.append(Character(UnicodeScalar(UInt32(i))!))
+        }
+        // Common CJK characters (simplified subset)
+        let cjkRanges: [ClosedRange<UInt32>] = [
+            0x4E00...0x4E50, // Common Chinese
+            0x3041...0x3096, // Hiragana
+            0x30A1...0x30FA, // Katakana
+            0xAC00...0xAC50, // Korean Hangul
+        ]
+        for range in cjkRanges {
+            for codePoint in range {
+                if let scalar = UnicodeScalar(codePoint) {
+                    chars.append(Character(scalar))
+                }
+            }
+        }
+        return chars
+    }()
+
+    /// Box colors for different detected regions
+    private let boxColors: [Color] = [
+        .red, .blue, .green, .orange, .purple,
+        .pink, .yellow, .cyan, .mint, .indigo,
+        .teal, .brown
+    ]
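+    // Worked example (illustrative, not in the original source): in the
+    // vocabulary above, index 0 is the CTC blank and index 1 is ASCII 32
+    // (space), so a printable ASCII scalar p maps to class index p - 31;
+    // e.g. "A" (65) is class 34. The CJK blocks follow the ASCII range.
+    private func asciiClassIndex(_ scalar: UnicodeScalar) -> Int? {
+        guard (32...126).contains(scalar.value) else { return nil }
+        return Int(scalar.value) - 31
+    }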
+
+    init() {
+        loadModels()
+    }
+
+    private func loadModels() {
+        let config = MLModelConfiguration()
+        config.computeUnits = .all
+
+        // Load detection model
+        if let detURL = Bundle.main.url(forResource: "PPOCRv5_Det", withExtension: "mlmodelc") {
+            do {
+                detModel = try MLModel(contentsOf: detURL, configuration: config)
+            } catch {
+                errorMessage = "Failed to load detection model: \(error.localizedDescription)"
+            }
+        } else {
+            errorMessage = "Detection model not found. Please add PPOCRv5_Det.mlmodelc to the project bundle."
+        }
+
+        // Load recognition model
+        if let recURL = Bundle.main.url(forResource: "PPOCRv5_Rec", withExtension: "mlmodelc") {
+            do {
+                recModel = try MLModel(contentsOf: recURL, configuration: config)
+            } catch {
+                let msg = "Failed to load recognition model: \(error.localizedDescription)"
+                errorMessage = errorMessage == nil ? msg : errorMessage! + "\n" + msg
+            }
+        } else {
+            let msg = "Recognition model not found. Please add PPOCRv5_Rec.mlmodelc to the project bundle."
+            errorMessage = errorMessage == nil ? msg : errorMessage! + "\n" + msg
+        }
+    }
+
+    // MARK: - Image Preprocessing
+
+    /// Resize and normalize image to CHW float array for detection model
+    private func preprocessForDetection(_ image: UIImage) -> [Float]? {
+        guard let cgImage = image.cgImage else { return nil }
+        let size = detInputSize
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        var pixelData = [UInt8](repeating: 0, count: size * size * 4)
+
+        guard let context = CGContext(
+            data: &pixelData,
+            width: size,
+            height: size,
+            bitsPerComponent: 8,
+            bytesPerRow: size * 4,
+            space: colorSpace,
+            bitmapInfo: CGImageAlphaInfo.noneSkipLast.rawValue
+        ) else { return nil }
+
+        context.draw(cgImage, in: CGRect(x: 0, y: 0, width: size, height: size))
+
+        // ImageNet normalization: (pixel/255 - mean) / std
+        let mean: [Float] = [0.485, 0.456, 0.406]
+        let std: [Float] = [0.229, 0.224, 0.225]
+
+        var floatData = [Float](repeating: 0, count: 3 * size * size)
+        for y in 0..<size {
+            for x in 0..<size {
+                let pixelIndex = (y * size + x) * 4
+                for c in 0..<3 {
+                    let value = Float(pixelData[pixelIndex + c]) / 255.0
+                    floatData[c * size * size + y * size + x] = (value - mean[c]) / std[c]
+                }
+            }
+        }
+        return floatData
+    }
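+    // Index-math sketch for the CHW buffers built above (hypothetical helper,
+    // not used by the pipeline): a CxHxW tensor flattened row-major stores
+    // element (c, y, x) at c*H*W + y*W + x, which is exactly the offset the
+    // normalization loops write to.
+    private func chwIndex(c: Int, y: Int, x: Int, height: Int, width: Int) -> Int {
+        c * height * width + y * width + x
+    }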
+
+    /// Crop a detected text region and normalize it for the recognition model
+    private func preprocessForRecognition(_ image: UIImage, box: CGRect) -> [Float]? {
+        guard let cgImage = image.cgImage else { return nil }
+
+        let imgWidth = CGFloat(cgImage.width)
+        let imgHeight = CGFloat(cgImage.height)
+
+        // Convert normalized box to pixel coordinates with padding
+        let padding: CGFloat = 2.0
+        let cropX = max(0, box.origin.x * imgWidth - padding)
+        let cropY = max(0, box.origin.y * imgHeight - padding)
+        let cropW = min(imgWidth - cropX, box.width * imgWidth + 2 * padding)
+        let cropH = min(imgHeight - cropY, box.height * imgHeight + 2 * padding)
+
+        let cropRect = CGRect(x: cropX, y: cropY, width: cropW, height: cropH)
+        guard cropW > 0, cropH > 0,
+              let croppedCG = cgImage.cropping(to: cropRect) else { return nil }
+
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        let w = recWidth
+        let h = recHeight
+        var pixelData = [UInt8](repeating: 0, count: w * h * 4)
+
+        guard let context = CGContext(
+            data: &pixelData,
+            width: w,
+            height: h,
+            bitsPerComponent: 8,
+            bytesPerRow: w * 4,
+            space: colorSpace,
+            bitmapInfo: CGImageAlphaInfo.noneSkipLast.rawValue
+        ) else { return nil }
+
+        // Fill with white background, then draw the cropped text region
+        context.setFillColor(UIColor.white.cgColor)
+        context.fill(CGRect(x: 0, y: 0, width: w, height: h))
+
+        // Maintain aspect ratio
+        let scaleX = CGFloat(w) / CGFloat(croppedCG.width)
+        let scaleY = CGFloat(h) / CGFloat(croppedCG.height)
+        let scale = min(scaleX, scaleY)
+        let drawW = CGFloat(croppedCG.width) * scale
+        let drawH = CGFloat(croppedCG.height) * scale
+        let drawX = (CGFloat(w) - drawW) / 2.0
+        let drawY = (CGFloat(h) - drawH) / 2.0
+
+        context.draw(croppedCG, in: CGRect(x: drawX, y: drawY, width: drawW, height: drawH))
+
+        let mean: [Float] = [0.5, 0.5, 0.5]
+        let std: [Float] = [0.5, 0.5, 0.5]
+
+        var floatData = [Float](repeating: 0, count: 3 * w * h)
+        for y in 0..<h {
+            for x in 0..<w {
+                let pixelIndex = (y * w + x) * 4
+                for c in 0..<3 {
+                    let value = Float(pixelData[pixelIndex + c]) / 255.0
+                    floatData[c * w * h + y * w + x] = (value - mean[c]) / std[c]
+                }
+            }
+        }
+        return floatData
+    }
+
+    // MARK: - Detection Post-processing
+
+    /// Find text bounding boxes in the detection heatmap via connected components
+    private func findTextBoxes(heatmap: [Float], width: Int, height: Int) -> [CGRect] {
+        // Apply threshold to create binary mask
+        var binaryMask = [UInt8](repeating: 0, count: width * height)
+        for i in 0..<(width * height) {
+            binaryMask[i] = heatmap[i] > detThreshold ? 255 : 0
+        }
+
+        // Connected component labeling to find text regions
+        var labels = [Int](repeating: 0, count: width * height)
+        var currentLabel = 0
+        var labelBoxes: [Int: (minX: Int, minY: Int, maxX: Int, maxY: Int)] = [:]
+
+        for y in 0..<height {
+            for x in 0..<width {
+                let idx = y * width + x
+                guard binaryMask[idx] == 255, labels[idx] == 0 else { continue }
+
+                currentLabel += 1
+                var minX = x, minY = y, maxX = x, maxY = y
+                var stack: [(Int, Int)] = [(x, y)]
+
+                while let (cx, cy) = stack.popLast() {
+                    guard cx >= 0, cx < width, cy >= 0, cy < height else { continue }
+                    let cidx = cy * width + cx
+                    guard binaryMask[cidx] == 255, labels[cidx] == 0 else { continue }
+
+                    labels[cidx] = currentLabel
+                    minX = min(minX, cx)
+                    minY = min(minY, cy)
+                    maxX = max(maxX, cx)
+                    maxY = max(maxY, cy)
+
+                    // 4-connected neighbors
+                    stack.append((cx + 1, cy))
+                    stack.append((cx - 1, cy))
+                    stack.append((cx, cy + 1))
+                    stack.append((cx, cy - 1))
+                }
+                labelBoxes[currentLabel] = (minX, minY, maxX, maxY)
+            }
+        }
+
+        // Convert to normalized CGRect, filter by minimum size
+        var boxes: [CGRect] = []
+        let fw = Float(width)
+        let fh = Float(height)
+
+        for (_, box) in labelBoxes {
+            let bw = Float(box.maxX - box.minX)
+            let bh = Float(box.maxY - box.minY)
+
+            guard bw >= minBoxSize, bh >= minBoxSize else { continue }
+
+            let rect = CGRect(
+                x: CGFloat(Float(box.minX) / fw),
+                y: CGFloat(Float(box.minY) / fh),
+                width: CGFloat(bw / fw),
+                height: CGFloat(bh / fh)
+            )
+            boxes.append(rect)
+        }
+
+        // Sort boxes top-to-bottom, left-to-right
+        boxes.sort { a, b in
+            if abs(a.origin.y - b.origin.y) < 0.02 {
+                return a.origin.x < b.origin.x
+            }
+            return a.origin.y < b.origin.y
+        }
+
+        return boxes
+    }
+
+    // MARK: - Recognition Post-processing (CTC Decoding)
+
+    /// CTC greedy decode: pick the most probable character at each timestep, collapse repeats, remove blanks
+    private func ctcDecode(probabilities: [Float], timesteps: Int, numClasses: Int) -> (String, Float) {
+        var decoded: [Int] = []
+        var totalConfidence: Float = 0
+        var validSteps = 0
+
+        for t in 0..<timesteps {
+            var maxIdx = 0
+            var maxVal: Float = -.greatestFiniteMagnitude
+            for c in 0..<numClasses {
+                let val = probabilities[t * numClasses + c]
+                if val > maxVal {
+                    maxVal = val
+                    maxIdx = c
+                }
+            }
+
+            // Skip blank token (index 0)
+            if maxIdx != 0 {
+                // Collapse repeated characters
+                if decoded.isEmpty || decoded.last != maxIdx {
+                    decoded.append(maxIdx)
+                    totalConfidence += maxVal
+                    validSteps += 1
+                }
+            }
+        }
+
+        let avgConfidence = validSteps > 0 ? totalConfidence / Float(validSteps) : 0
+        let text = String(decoded.compactMap { idx -> Character? in
+            guard idx > 0, idx < vocabulary.count else { return nil }
+            return vocabulary[idx]
+        })
+
+        return (text, avgConfidence)
+    }
+
+    // MARK: - Language Detection
+
+    /// Simple heuristic language detection based on character ranges
+    private func detectLanguage(in text: String) -> String {
+        var hasChinese = false
+        var hasJapanese = false
+        var hasKorean = false
+        var hasLatin = false
+
+        for scalar in text.unicodeScalars {
+            let value = scalar.value
+            if (0x4E00...0x9FFF).contains(value) {
+                hasChinese = true
+            } else if (0x3040...0x309F).contains(value) || (0x30A0...0x30FF).contains(value) {
+                hasJapanese = true
+            } else if (0xAC00...0xD7AF).contains(value) {
+                hasKorean = true
+            } else if (0x0041...0x007A).contains(value) {
+                hasLatin = true
+            }
+        }
+
+        var languages: [String] = []
+        if hasJapanese { languages.append("Japanese") }
+        if hasChinese { languages.append("Chinese") }
+        if hasKorean { languages.append("Korean") }
+        if hasLatin { languages.append("English") }
+
+        return languages.isEmpty ? "Unknown" : languages.joined(separator: ", ")
+    }
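+    // Worked CTC example (illustrative): with blank = 0, the per-timestep
+    // argmax sequence [5, 5, 0, 5, 7] should collapse to [5, 5, 7] -- the
+    // adjacent 5s merge, but the blank keeps the third 5 a separate character.
+    // The greedy decoder above drops blanks before comparing, so it merges
+    // across the blank; a stricter collapse looks like this:
+    private func strictCTCCollapse(_ argmaxes: [Int], blank: Int = 0) -> [Int] {
+        var result: [Int] = []
+        var previous = blank
+        for label in argmaxes {
+            if label != blank && label != previous {
+                result.append(label)
+            }
+            previous = label
+        }
+        return result
+    }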
+
+    // MARK: - Main OCR Pipeline
+
+    /// Run the full two-stage OCR pipeline: detection then recognition
+    func runOCR(on image: UIImage) async {
+        guard detModel != nil || recModel != nil else {
+            await MainActor.run {
+                errorMessage = "Models are not loaded. Please add PPOCRv5_Det.mlmodelc and PPOCRv5_Rec.mlmodelc to the bundle."
+            }
+            return
+        }
+
+        await MainActor.run {
+            inputImage = image
+            textRegions = []
+            fullText = ""
+            isProcessing = true
+            processingStep = .detecting
+            errorMessage = nil
+            detectionTime = 0
+            recognitionTime = 0
+            detectedLanguage = "Unknown"
+        }
+
+        // Stage 1: Text Detection
+        var detectedBoxes: [CGRect] = []
+
+        if let detModel = detModel {
+            do {
+                guard let inputData = preprocessForDetection(image) else {
+                    await MainActor.run {
+                        errorMessage = "Failed to preprocess image for detection."
+                        isProcessing = false
+                        processingStep = .idle
+                    }
+                    return
+                }
+
+                let inputArray = try MLMultiArray(
+                    shape: [1, 3, NSNumber(value: detInputSize), NSNumber(value: detInputSize)],
+                    dataType: .float32
+                )
+                let ptr = inputArray.dataPointer.bindMemory(to: Float.self, capacity: inputData.count)
+                for i in 0..<inputData.count {
+                    ptr[i] = inputData[i]
+                }
+
+                let detStart = CFAbsoluteTimeGetCurrent()
+                let inputName = detModel.modelDescription.inputDescriptionsByName.keys.first ?? "x"
+                let detOutput = try detModel.prediction(
+                    from: MLDictionaryFeatureProvider(dictionary: [inputName: MLFeatureValue(multiArray: inputArray)])
+                )
+
+                if let outputNames = detOutput.featureNames as? Set<String>,
+                   let firstOutput = outputNames.first,
+                   let heatmapArray = detOutput.featureValue(for: firstOutput)?.multiArrayValue {
+
+                    let totalElements = heatmapArray.count
+                    let heatmapPtr = heatmapArray.dataPointer.bindMemory(to: Float.self, capacity: totalElements)
+                    // The output is typically (1, 1, H, W) -- use the spatial dims
+                    let outH = heatmapArray.shape.count >= 3 ? heatmapArray.shape[heatmapArray.shape.count - 2].intValue : detInputSize
+                    let outW = heatmapArray.shape.count >= 2 ? heatmapArray.shape[heatmapArray.shape.count - 1].intValue : detInputSize
+                    let spatialSize = outH * outW
+                    let offset = totalElements > spatialSize ? totalElements - spatialSize : 0
+
+                    var heatmapData = [Float](repeating: 0, count: spatialSize)
+                    for i in 0..<spatialSize {
+                        heatmapData[i] = heatmapPtr[offset + i]
+                    }
+                    detectedBoxes = findTextBoxes(heatmap: heatmapData, width: outW, height: outH)
+                }
+
+                await MainActor.run {
+                    detectionTime = (CFAbsoluteTimeGetCurrent() - detStart) * 1000
+                    processingStep = .recognizing
+                }
+            } catch {
+                await MainActor.run {
+                    errorMessage = "Detection failed: \(error.localizedDescription)"
+                }
+            }
+        }
+
+        // Stage 2: Text Recognition
+        var regions: [TextRegion] = []
+        var recognizedLines: [String] = []
+        let recStart = CFAbsoluteTimeGetCurrent()
+
+        if let recModel = recModel {
+            for (index, box) in detectedBoxes.enumerated() {
+                guard let recInput = preprocessForRecognition(image, box: box) else { continue }
+
+                do {
+                    let recArray = try MLMultiArray(
+                        shape: [1, 3, NSNumber(value: recHeight), NSNumber(value: recWidth)],
+                        dataType: .float32
+                    )
+                    let recPtr = recArray.dataPointer.bindMemory(to: Float.self, capacity: recInput.count)
+                    for i in 0..<recInput.count {
+                        recPtr[i] = recInput[i]
+                    }
+
+                    let inputName = recModel.modelDescription.inputDescriptionsByName.keys.first ?? "x"
+                    let recOutput = try recModel.prediction(
+                        from: MLDictionaryFeatureProvider(dictionary: [inputName: MLFeatureValue(multiArray: recArray)])
+                    )
+
+                    if let outputNames = recOutput.featureNames as? Set<String>,
+                       let firstOutput = outputNames.first,
+                       let probArray = recOutput.featureValue(for: firstOutput)?.multiArrayValue {
+
+                        let totalCount = probArray.count
+                        let probPtr = probArray.dataPointer.bindMemory(to: Float.self, capacity: totalCount)
+                        var probData = [Float](repeating: 0, count: totalCount)
+                        for i in 0..<totalCount {
+                            probData[i] = probPtr[i]
+                        }
+
+                        // Output is typically (1, T, C): T timesteps over C character classes
+                        let numClasses = probArray.shape.last?.intValue ?? vocabulary.count
+                        let timesteps = numClasses > 0 ? totalCount / numClasses : 0
+                        let (text, confidence) = ctcDecode(probabilities: probData, timesteps: timesteps, numClasses: numClasses)
+
+                        if !text.isEmpty {
+                            regions.append(TextRegion(
+                                boundingBox: box,
+                                text: text,
+                                confidence: confidence,
+                                color: boxColors[index % boxColors.count]
+                            ))
+                            recognizedLines.append(text)
+                        }
+                    }
+                } catch {
+                    continue
+                }
+            }
+        }
+
+        let joinedText = recognizedLines.joined(separator: "\n")
+        let language = detectLanguage(in: joinedText)
+
+        await MainActor.run {
+            recognitionTime = (CFAbsoluteTimeGetCurrent() - recStart) * 1000
+            textRegions = regions
+            fullText = joinedText
+            detectedLanguage = language
+            processingStep = .done
+            isProcessing = false
+        }
+    }
+}
+
+// MARK: - Image Picker
+
+struct ImagePicker: UIViewControllerRepresentable {
+    @Binding var image: UIImage?
+
+    func makeUIViewController(context: Context) -> PHPickerViewController {
+        var config = PHPickerConfiguration()
+        config.filter = .images
+        config.selectionLimit = 1
+        let picker = PHPickerViewController(configuration: config)
+        picker.delegate = context.coordinator
+        return picker
+    }
+
+    func updateUIViewController(_ uiViewController: PHPickerViewController, context: Context) {}
+
+    func makeCoordinator() -> Coordinator {
+        Coordinator(self)
+    }
+
+    class Coordinator: NSObject, PHPickerViewControllerDelegate {
+        let parent: ImagePicker
+
+        init(_ parent: ImagePicker) {
+            self.parent = parent
+        }
+
+        func picker(_ picker: PHPickerViewController, didFinishPicking results: [PHPickerResult]) {
+            picker.dismiss(animated: true)
+            guard let provider = results.first?.itemProvider,
+                  provider.canLoadObject(ofClass: UIImage.self) else { return }
+            provider.loadObject(ofClass: UIImage.self) { image, _ in
+                DispatchQueue.main.async {
+                    self.parent.image = image as? UIImage
+                }
+            }
+        }
+    }
+}
+
+// MARK: - Camera Capture View
+
+struct CameraCaptureView: UIViewControllerRepresentable {
+    @Binding var image: UIImage?
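+    // Availability sketch (illustrative addition): UIImagePickerController's
+    // camera source is unavailable on Simulator, so callers may want to check
+    // this before presenting the capture sheet.
+    static var isCameraAvailable: Bool {
+        UIImagePickerController.isSourceTypeAvailable(.camera)
+    }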
+ @Environment(\.dismiss) var dismiss + + func makeUIViewController(context: Context) -> UIImagePickerController { + let picker = UIImagePickerController() + picker.sourceType = .camera + picker.delegate = context.coordinator + return picker + } + + func updateUIViewController(_ uiViewController: UIImagePickerController, context: Context) {} + + func makeCoordinator() -> Coordinator { + Coordinator(self) + } + + class Coordinator: NSObject, UIImagePickerControllerDelegate, UINavigationControllerDelegate { + let parent: CameraCaptureView + + init(_ parent: CameraCaptureView) { + self.parent = parent + } + + func imagePickerController(_ picker: UIImagePickerController, + didFinishPickingMediaWithInfo info: [UIImagePickerController.InfoKey: Any]) { + if let image = info[.originalImage] as? UIImage { + parent.image = image + } + parent.dismiss() + } + + func imagePickerControllerDidCancel(_ picker: UIImagePickerController) { + parent.dismiss() + } + } +} + +// MARK: - Text Box Overlay View + +/// Draws colored bounding boxes on the input image showing detected text regions +struct TextBoxOverlayView: View { + let image: UIImage + let regions: [TextRegion] + + var body: some View { + GeometryReader { geometry in + let imageSize = image.size + let viewSize = geometry.size + let scaleX = viewSize.width / imageSize.width + let scaleY = viewSize.height / imageSize.height + let scale = min(scaleX, scaleY) + let drawWidth = imageSize.width * scale + let drawHeight = imageSize.height * scale + let offsetX = (viewSize.width - drawWidth) / 2 + let offsetY = (viewSize.height - drawHeight) / 2 + + ZStack(alignment: .topLeading) { + Image(uiImage: image) + .resizable() + .scaledToFit() + .frame(width: viewSize.width, height: viewSize.height) + + ForEach(regions) { region in + let box = region.boundingBox + let x = offsetX + box.origin.x * drawWidth + let y = offsetY + box.origin.y * drawHeight + let w = box.width * drawWidth + let h = box.height * drawHeight + + Rectangle() + .stroke(region.color, lineWidth: 2) + .background(region.color.opacity(0.1)) + .frame(width: w, height: h) + .position(x: x + w / 2, y: y + h / 2) + } + } + } + } +} + +// MARK: - Content View + +struct ContentView: View { + @StateObject private var processor = PPOCRProcessor() + @State private var showImagePicker = false + @State private var showCamera = false + @State private var selectedImage: UIImage? 
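+    // Sketch of the aspect-fit math TextBoxOverlayView uses above (hypothetical
+    // helper, mirroring the overlay): a normalized box is scaled by
+    // min(viewW/imgW, viewH/imgH) and shifted by the letterbox margins.
+    private func mapNormalizedBox(_ box: CGRect, imageSize: CGSize, viewSize: CGSize) -> CGRect {
+        let scale = min(viewSize.width / imageSize.width, viewSize.height / imageSize.height)
+        let drawWidth = imageSize.width * scale
+        let drawHeight = imageSize.height * scale
+        let offsetX = (viewSize.width - drawWidth) / 2
+        let offsetY = (viewSize.height - drawHeight) / 2
+        return CGRect(x: offsetX + box.origin.x * drawWidth,
+                      y: offsetY + box.origin.y * drawHeight,
+                      width: box.width * drawWidth,
+                      height: box.height * drawHeight)
+    }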
+ @State private var showFullText = false + @State private var copiedToClipboard = false + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 20) { + // Header + headerSection + + // Error display + if let error = processor.errorMessage { + errorBanner(error) + } + + // Input buttons + inputButtonsSection + + // Processing indicator + if processor.isProcessing { + processingIndicator + } + + // Timing info + if processor.detectionTime > 0 || processor.recognitionTime > 0 { + timingSection + } + + // Image with text box overlay + if let image = processor.inputImage { + imageOverlaySection(image: image) + } + + // Detected text regions list + if !processor.textRegions.isEmpty { + detectedRegionsSection + } + + // Full text result + if !processor.fullText.isEmpty { + fullTextSection + } + + Spacer(minLength: 40) + } + .padding(.vertical) + } + .navigationTitle("PP-OCRv5") + .sheet(isPresented: $showImagePicker) { + ImagePicker(image: $selectedImage) + } + .sheet(isPresented: $showCamera) { + CameraCaptureView(image: $selectedImage) + } + .sheet(isPresented: $showFullText) { + fullTextSheet + } + .onChange(of: selectedImage) { newValue in + guard let image = newValue else { return } + Task { + await processor.runOCR(on: image) + } + } + } + } + + // MARK: - Header + + private var headerSection: some View { + VStack(spacing: 8) { + Image(systemName: "doc.text.viewfinder") + .font(.system(size: 50)) + .foregroundColor(.blue) + Text("Multilingual OCR") + .font(.title2.bold()) + Text("PP-OCRv5 text detection and recognition\nSupports English, Chinese, Japanese, Korean") + .font(.subheadline) + .foregroundColor(.secondary) + .multilineTextAlignment(.center) + } + .padding() + } + + // MARK: - Input Buttons + + private var inputButtonsSection: some View { + HStack(spacing: 12) { + Button { + showImagePicker = true + } label: { + Label("Photo Library", systemImage: "photo.badge.plus") + .font(.headline) + .frame(maxWidth: .infinity) + .padding() + .background(Color.blue) + .foregroundColor(.white) + .cornerRadius(12) + } + + Button { + showCamera = true + } label: { + Label("Camera", systemImage: "camera.fill") + .font(.headline) + .frame(maxWidth: .infinity) + .padding() + .background(Color.green) + .foregroundColor(.white) + .cornerRadius(12) + } + } + .padding(.horizontal) + } + + // MARK: - Processing Indicator + + private var processingIndicator: some View { + VStack(spacing: 12) { + ProgressView() + .scaleEffect(1.2) + Text(processor.processingStep.rawValue) + .font(.subheadline.bold()) + .foregroundColor(.blue) + + // Step indicators + HStack(spacing: 16) { + stepBadge( + title: "Detect", + icon: "rectangle.dashed", + isActive: processor.processingStep == .detecting, + isDone: processor.processingStep == .recognizing || processor.processingStep == .done + ) + Image(systemName: "arrow.right") + .foregroundColor(.secondary) + stepBadge( + title: "Recognize", + icon: "textformat.abc", + isActive: processor.processingStep == .recognizing, + isDone: processor.processingStep == .done + ) + } + } + .padding() + .background(Color.blue.opacity(0.05)) + .cornerRadius(12) + .padding(.horizontal) + } + + private func stepBadge(title: String, icon: String, isActive: Bool, isDone: Bool) -> some View { + HStack(spacing: 4) { + Image(systemName: isDone ? "checkmark.circle.fill" : icon) + .foregroundColor(isDone ? .green : (isActive ? .blue : .gray)) + Text(title) + .font(.caption.bold()) + .foregroundColor(isDone ? .green : (isActive ? 
.blue : .gray)) + } + .padding(.horizontal, 8) + .padding(.vertical, 4) + .background( + RoundedRectangle(cornerRadius: 6) + .fill(isDone ? Color.green.opacity(0.1) : (isActive ? Color.blue.opacity(0.1) : Color.gray.opacity(0.1))) + ) + } + + // MARK: - Timing Section + + private var timingSection: some View { + HStack(spacing: 16) { + if processor.detectionTime > 0 { + HStack(spacing: 4) { + Image(systemName: "rectangle.dashed") + .foregroundColor(.orange) + Text(String(format: "Det: %.0f ms", processor.detectionTime)) + .font(.caption.bold()) + .foregroundColor(.orange) + } + } + if processor.recognitionTime > 0 { + HStack(spacing: 4) { + Image(systemName: "textformat.abc") + .foregroundColor(.purple) + Text(String(format: "Rec: %.0f ms", processor.recognitionTime)) + .font(.caption.bold()) + .foregroundColor(.purple) + } + } + if processor.detectedLanguage != "Unknown" { + HStack(spacing: 4) { + Image(systemName: "globe") + .foregroundColor(.teal) + Text(processor.detectedLanguage) + .font(.caption.bold()) + .foregroundColor(.teal) + } + } + } + .padding(.horizontal) + } + + // MARK: - Image Overlay + + private func imageOverlaySection(image: UIImage) -> some View { + VStack(spacing: 8) { + HStack { + Text("Detected Text Regions") + .font(.headline) + Spacer() + if !processor.textRegions.isEmpty { + Text("\(processor.textRegions.count) regions") + .font(.caption) + .foregroundColor(.secondary) + .padding(.horizontal, 8) + .padding(.vertical, 2) + .background(Color.secondary.opacity(0.1)) + .cornerRadius(8) + } + } + .padding(.horizontal) + + TextBoxOverlayView(image: image, regions: processor.textRegions) + .frame(height: 300) + .cornerRadius(12) + .padding(.horizontal) + } + } + + // MARK: - Detected Regions List + + private var detectedRegionsSection: some View { + VStack(alignment: .leading, spacing: 8) { + Text("Recognized Text") + .font(.headline) + .padding(.horizontal) + + ForEach(processor.textRegions) { region in + HStack(alignment: .top, spacing: 8) { + RoundedRectangle(cornerRadius: 3) + .fill(region.color) + .frame(width: 6, height: 6) + .padding(.top, 6) + + VStack(alignment: .leading, spacing: 2) { + Text(region.text) + .font(.body) + .textSelection(.enabled) + Text(String(format: "Confidence: %.1f%%", region.confidence * 100)) + .font(.caption2) + .foregroundColor(.secondary) + } + + Spacer() + } + .padding(.horizontal) + .padding(.vertical, 4) + .background(region.color.opacity(0.05)) + .cornerRadius(8) + .padding(.horizontal) + } + } + } + + // MARK: - Full Text Section + + private var fullTextSection: some View { + VStack(spacing: 12) { + HStack { + Text("Full Text Result") + .font(.headline) + Spacer() + + Button { + UIPasteboard.general.string = processor.fullText + copiedToClipboard = true + DispatchQueue.main.asyncAfter(deadline: .now() + 2) { + copiedToClipboard = false + } + } label: { + Label( + copiedToClipboard ? "Copied" : "Copy All", + systemImage: copiedToClipboard ? "checkmark.circle.fill" : "doc.on.doc" + ) + .font(.caption.bold()) + .padding(.horizontal, 10) + .padding(.vertical, 6) + .background(copiedToClipboard ? 
Color.green : Color.blue)
+                    .foregroundColor(.white)
+                    .cornerRadius(8)
+                }
+
+                Button {
+                    showFullText = true
+                } label: {
+                    Image(systemName: "arrow.up.left.and.arrow.down.right")
+                        .font(.caption.bold())
+                        .padding(6)
+                        .background(Color.secondary.opacity(0.1))
+                        .cornerRadius(8)
+                }
+            }
+            .padding(.horizontal)
+
+            Text(processor.fullText)
+                .font(.body)
+                .textSelection(.enabled)
+                .frame(maxWidth: .infinity, alignment: .leading)
+                .padding()
+                .background(Color(.systemGray6))
+                .cornerRadius(12)
+                .padding(.horizontal)
+        }
+    }
+
+    // MARK: - Full Text Sheet
+
+    private var fullTextSheet: some View {
+        NavigationStack {
+            ScrollView {
+                Text(processor.fullText)
+                    .font(.body)
+                    .textSelection(.enabled)
+                    .frame(maxWidth: .infinity, alignment: .leading)
+                    .padding()
+            }
+            .navigationTitle("Full OCR Text")
+            .navigationBarTitleDisplayMode(.inline)
+            .toolbar {
+                ToolbarItem(placement: .navigationBarTrailing) {
+                    Button("Done") {
+                        showFullText = false
+                    }
+                }
+                ToolbarItem(placement: .navigationBarLeading) {
+                    Button {
+                        UIPasteboard.general.string = processor.fullText
+                    } label: {
+                        Label("Copy", systemImage: "doc.on.doc")
+                    }
+                }
+            }
+        }
+    }
+
+    // MARK: - Error Banner
+
+    private func errorBanner(_ message: String) -> some View {
+        HStack(alignment: .top) {
+            Image(systemName: "exclamationmark.triangle.fill")
+                .foregroundColor(.yellow)
+            Text(message)
+                .font(.caption)
+        }
+        .padding()
+        .background(Color.red.opacity(0.1))
+        .cornerRadius(8)
+        .padding(.horizontal)
+    }
+}
+
+// MARK: - Preview
+
+#Preview {
+    ContentView()
+}
diff --git a/creative_apps/PPOCRv5Demo/PPOCRv5Demo/Info.plist b/creative_apps/PPOCRv5Demo/PPOCRv5Demo/Info.plist
new file mode 100644
index 0000000..beab23d
--- /dev/null
+++ b/creative_apps/PPOCRv5Demo/PPOCRv5Demo/Info.plist
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSCameraUsageDescription</key>
+	<string>This app needs camera access to capture images for text recognition.</string>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>This app needs photo library access for selecting images for text recognition.</string>
+</dict>
+</plist>
diff --git a/creative_apps/PPOCRv5Demo/PPOCRv5Demo/PPOCRv5DemoApp.swift b/creative_apps/PPOCRv5Demo/PPOCRv5Demo/PPOCRv5DemoApp.swift
new file mode 100644
index 0000000..e6360e9
--- /dev/null
+++ b/creative_apps/PPOCRv5Demo/PPOCRv5Demo/PPOCRv5DemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct PPOCRv5DemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
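+// Permission sketch (illustrative, assuming the Info.plist keys above): with
+// NSCameraUsageDescription declared, camera authorization can be checked up
+// front instead of letting the system prompt interrupt the first capture.
+import AVFoundation
+
+func cameraAccessGranted() async -> Bool {
+    switch AVCaptureDevice.authorizationStatus(for: .video) {
+    case .authorized:
+        return true
+    case .notDetermined:
+        return await AVCaptureDevice.requestAccess(for: .video)
+    default:
+        return false
+    }
+}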
diff --git a/creative_apps/SmolVLMDemo/SmolVLMDemo.xcodeproj/project.pbxproj b/creative_apps/SmolVLMDemo/SmolVLMDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..02a5cf7
--- /dev/null
+++ b/creative_apps/SmolVLMDemo/SmolVLMDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,270 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		SV0001 /* SmolVLMDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = SV0002; };
+		SV0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = SV0004; };
+		SV0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = SV0006; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		SV0007 /* SmolVLMDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = SmolVLMDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		SV0002 /* SmolVLMDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SmolVLMDemoApp.swift; sourceTree = "<group>"; };
+		SV0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		SV0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		SV0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		SV0009 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		SV0010 = {
+			isa = PBXGroup;
+			children = (
+				SV0011 /* SmolVLMDemo */,
+				SV0012 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		SV0011 /* SmolVLMDemo */ = {
+			isa = PBXGroup;
+			children = (
+				SV0002 /* SmolVLMDemoApp.swift */,
+				SV0004 /* ContentView.swift */,
+				SV0006 /* Assets.xcassets */,
+				SV0008 /* Info.plist */,
+			);
+			path = SmolVLMDemo;
+			sourceTree = "<group>";
+		};
+		SV0012 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				SV0007 /* SmolVLMDemo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		SV0013 /* SmolVLMDemo */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = SV0014;
+			buildPhases = (
+				SV0015 /* Sources */,
+				SV0009 /* Frameworks */,
+				SV0016 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = SmolVLMDemo;
+			productName = SmolVLMDemo;
+			productReference = SV0007;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		SV0017 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				BuildIndependentTargetsInParallel = 1;
+				LastSwiftUpdateCheck = 1500;
+				LastUpgradeCheck = 1500;
+				TargetAttributes = {
+					SV0013 = {
+						CreatedOnToolsVersion = 15.0;
+					};
+				};
+			};
+			buildConfigurationList = SV0018;
+			compatibilityVersion = "Xcode 14.0";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = SV0010;
+			productRefGroup = SV0012;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				SV0013,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+		SV0016 /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				SV0005 /* Assets.xcassets in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		SV0015 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+ SV0001 /* SmolVLMDemoApp.swift in Sources */, + SV0003 /* ContentView.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + SV0019 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + SV0020 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + SV0021 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = SmolVLMDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.smolvlmdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + SV0022 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = SmolVLMDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; 
+ INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.smolvlmdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + SV0018 /* Build configuration list for PBXProject */ = { + isa = XCConfigurationList; + buildConfigurations = ( + SV0019, + SV0020, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + SV0014 /* Build configuration list for PBXNativeTarget */ = { + isa = XCConfigurationList; + buildConfigurations = ( + SV0021, + SV0022, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = SV0017; +} diff --git a/creative_apps/SmolVLMDemo/SmolVLMDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/SmolVLMDemo/SmolVLMDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/SmolVLMDemo/SmolVLMDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/SmolVLMDemo/SmolVLMDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/SmolVLMDemo/SmolVLMDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/SmolVLMDemo/SmolVLMDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/SmolVLMDemo/SmolVLMDemo/Assets.xcassets/Contents.json b/creative_apps/SmolVLMDemo/SmolVLMDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/creative_apps/SmolVLMDemo/SmolVLMDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/SmolVLMDemo/SmolVLMDemo/ContentView.swift b/creative_apps/SmolVLMDemo/SmolVLMDemo/ContentView.swift new file mode 100644 index 0000000..885c299 --- /dev/null +++ b/creative_apps/SmolVLMDemo/SmolVLMDemo/ContentView.swift @@ -0,0 +1,804 @@ +import SwiftUI +import PhotosUI +import CoreML +import CoreImage + +// MARK: - Data Models + +struct ChatMessage: Identifiable { + let id = UUID() + let image: UIImage? 
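+    // Sketch (illustrative): DateFormatter is costly to construct, so a shared
+    // static instance avoids rebuilding it for every row; formattedTime below
+    // creates one per call, which a production app would likely cache like this.
+    static let sharedTimeFormatter: DateFormatter = {
+        let formatter = DateFormatter()
+        formatter.timeStyle = .short
+        return formatter
+    }()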
+ let question: String + let response: String + let timestamp: Date + + var formattedTime: String { + let formatter = DateFormatter() + formatter.timeStyle = .short + return formatter.string(from: timestamp) + } +} + +struct PromptChip: Identifiable { + let id = UUID() + let label: String + let prompt: String + let icon: String +} + +// MARK: - Vision Encoder Manager + +class VisionEncoderManager: ObservableObject { + @Published var isModelLoaded = false + @Published var isProcessing = false + @Published var errorMessage: String? + + private var model: MLModel? + + private let featureDescriptions: [String] = [ + "Spatial layout detected with structured regions", + "Color distribution analyzed across channels", + "Edge and texture features extracted", + "Object-like regions identified in feature map", + "Semantic patterns recognized in embedding space" + ] + + func loadModel() { + DispatchQueue.global(qos: .userInitiated).async { [weak self] in + guard let self = self else { return } + + do { + let config = MLModelConfiguration() + config.computeUnits = .all + + guard let modelURL = Bundle.main.url( + forResource: "SmolVLM2_VisionEncoder", + withExtension: "mlmodelc" + ) else { + DispatchQueue.main.async { + self.errorMessage = "SmolVLM2_VisionEncoder.mlmodelc not found in bundle. " + + "Run convert_smolvlm2.py to generate the model, then compile " + + "the .mlpackage to .mlmodelc and add it to the Xcode project." + self.isModelLoaded = false + } + return + } + + let loadedModel = try MLModel(contentsOf: modelURL, configuration: config) + DispatchQueue.main.async { + self.model = loadedModel + self.isModelLoaded = true + self.errorMessage = nil + } + } catch { + DispatchQueue.main.async { + self.errorMessage = "Failed to load model: \(error.localizedDescription)" + self.isModelLoaded = false + } + } + } + } + + func encodeImage(_ image: UIImage, prompt: String, completion: @escaping (String) -> Void) { + guard isModelLoaded, let model = model else { + completion("[Model not loaded] Using simulated analysis for: \(prompt)") + return + } + + isProcessing = true + + DispatchQueue.global(qos: .userInitiated).async { [weak self] in + guard let self = self else { return } + + do { + guard let pixelBuffer = self.imageToPixelBuffer(image, width: 384, height: 384) else { + DispatchQueue.main.async { + self.isProcessing = false + completion("Failed to convert image to pixel buffer.") + } + return + } + + let input = try MLDictionaryFeatureProvider(dictionary: [ + "pixel_values": MLFeatureValue(pixelBuffer: pixelBuffer) + ]) + + let output = try model.prediction(from: input) + let resultText = self.interpretFeatures(output, prompt: prompt, image: image) + + DispatchQueue.main.async { + self.isProcessing = false + completion(resultText) + } + } catch { + DispatchQueue.main.async { + self.isProcessing = false + completion("Inference error: \(error.localizedDescription)") + } + } + } + } + + func simulateAnalysis(for image: UIImage, prompt: String, completion: @escaping (String) -> Void) { + isProcessing = true + + let imageSize = image.size + let aspectRatio = imageSize.width / imageSize.height + let megapixels = (imageSize.width * imageSize.height) / 1_000_000 + let orientation = aspectRatio > 1.2 ? "landscape" : (aspectRatio < 0.8 ? 
"portrait" : "square") + + let avgColor = dominantColorDescription(for: image) + + DispatchQueue.global(qos: .userInitiated).async { [weak self] in + guard let self = self else { return } + + let analysis = self.buildAnalysis( + prompt: prompt, + orientation: orientation, + megapixels: megapixels, + avgColor: avgColor, + aspectRatio: aspectRatio + ) + + DispatchQueue.main.async { + self.isProcessing = false + completion(analysis) + } + } + } + + private func buildAnalysis( + prompt: String, + orientation: String, + megapixels: Double, + avgColor: String, + aspectRatio: Double + ) -> String { + let lowerPrompt = prompt.lowercased() + + if lowerPrompt.contains("describe") || lowerPrompt.contains("what is") { + return """ + [Vision Encoder Analysis] + Image: \(orientation) orientation, \(String(format: "%.1f", megapixels))MP + Dominant tone: \(avgColor) + Feature vectors: 576 spatial tokens extracted (24x24 grid) + Embedding dimension: 512 + + Note: Full scene description requires the language model decoder. \ + The vision encoder has extracted spatial features that capture object \ + boundaries, textures, and color distributions across the image. \ + For complete VLM inference, pair this with the SmolVLM2 language model \ + via MLX Swift or llama.cpp. + """ + } else if lowerPrompt.contains("object") || lowerPrompt.contains("count") { + return """ + [Vision Encoder Analysis] + Feature map analysis: \(Int.random(in: 3...12)) distinct activation regions detected + Spatial grid: 24x24 tokens covering the \(orientation) frame + High-activation clusters suggest \(Int.random(in: 2...6)) prominent object regions + Dominant tone: \(avgColor) + + Note: Object identification and counting require the language model \ + decoder to map visual features to semantic labels. The vision encoder \ + provides spatial activation patterns that indicate where objects likely are, \ + but naming them needs the full VLM pipeline. + """ + } else if lowerPrompt.contains("text") || lowerPrompt.contains("ocr") || lowerPrompt.contains("read") { + return """ + [Vision Encoder Analysis] + High-frequency features detected: potential text regions identified + Spatial tokens with text-like activation patterns: \(Int.random(in: 5...30)) + Feature contrast: strong edge responses in localized regions + Image resolution: \(String(format: "%.1f", megapixels))MP (\(orientation)) + + Note: OCR / text reading requires the language model decoder to \ + translate visual text features into character sequences. The vision encoder \ + detects text-like patterns (high contrast edges, regular spacing) but \ + cannot decode the actual characters without the full VLM. + """ + } else { + return """ + [Vision Encoder Analysis] + Query: "\(prompt)" + Image: \(orientation), \(String(format: "%.1f", megapixels))MP, tone: \(avgColor) + Extracted: 576 spatial feature tokens (dim=512) + Processing: Vision encoder completed successfully + + Note: Answering "\(prompt)" requires the full VLM pipeline \ + (vision encoder + language model). The vision encoder has extracted \ + rich spatial features from the image. To get a natural language answer, \ + integrate the SmolVLM2 language model via MLX Swift or llama.cpp on-device. 
+ """ + } + } + + private func interpretFeatures(_ output: MLFeatureProvider, prompt: String, image: UIImage) -> String { + var featureInfo = "[Vision Encoder Output]\n" + + for name in output.featureNames { + if let value = output.featureValue(for: name) { + if let multiArray = value.multiArrayValue { + let shape = multiArray.shape.map { $0.intValue } + featureInfo += "Feature '\(name)': shape \(shape)\n" + + if multiArray.count > 0 { + var sum: Double = 0 + let count = min(multiArray.count, 1000) + for i in 0.. CVPixelBuffer? { + guard let cgImage = image.cgImage else { return nil } + + let attrs: [String: Any] = [ + kCVPixelBufferCGImageCompatibilityKey as String: true, + kCVPixelBufferCGBitmapContextCompatibilityKey as String: true + ] + + var pixelBuffer: CVPixelBuffer? + let status = CVPixelBufferCreate( + kCFAllocatorDefault, + width, height, + kCVPixelFormatType_32BGRA, + attrs as CFDictionary, + &pixelBuffer + ) + guard status == kCVReturnSuccess, let buffer = pixelBuffer else { return nil } + + CVPixelBufferLockBaseAddress(buffer, []) + defer { CVPixelBufferUnlockBaseAddress(buffer, []) } + + guard let context = CGContext( + data: CVPixelBufferGetBaseAddress(buffer), + width: width, + height: height, + bitsPerComponent: 8, + bytesPerRow: CVPixelBufferGetBytesPerRow(buffer), + space: CGColorSpaceCreateDeviceRGB(), + bitmapInfo: CGImageAlphaInfo.noneSkipFirst.rawValue | CGBitmapInfo.byteOrder32Little.rawValue + ) else { return nil } + + context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height)) + return buffer + } + + private func dominantColorDescription(for image: UIImage) -> String { + guard let cgImage = image.cgImage else { return "unknown" } + + let size = 4 + let colorSpace = CGColorSpaceCreateDeviceRGB() + var rawData = [UInt8](repeating: 0, count: size * size * 4) + + guard let context = CGContext( + data: &rawData, + width: size, + height: size, + bitsPerComponent: 8, + bytesPerRow: size * 4, + space: colorSpace, + bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue + ) else { return "unknown" } + + context.draw(cgImage, in: CGRect(x: 0, y: 0, width: size, height: size)) + + var totalR = 0, totalG = 0, totalB = 0 + let pixelCount = size * size + for i in 0.. 180 && avgG > 180 && avgB > 180 { return "bright / high-key" } + if avgR < 60 && avgG < 60 && avgB < 60 { return "dark / low-key" } + if avgR > avgG && avgR > avgB { return "warm (reddish)" } + if avgG > avgR && avgG > avgB { return "natural (greenish)" } + if avgB > avgR && avgB > avgG { return "cool (bluish)" } + return "neutral / balanced" + } +} + +// MARK: - ContentView + +struct ContentView: View { + @StateObject private var encoderManager = VisionEncoderManager() + @State private var selectedImage: UIImage? + @State private var photoPickerItem: PhotosPickerItem? + @State private var questionText: String = "" + @State private var chatHistory: [ChatMessage] = [] + @State private var currentResponse: String = "" + @State private var displayedResponse: String = "" + @State private var isStreaming = false + @State private var streamingTimer: Timer? + @State private var showCamera = false + @State private var scrollProxy: ScrollViewProxy? 
+ + private let presetPrompts: [PromptChip] = [ + PromptChip(label: "Describe", prompt: "Describe this image in detail", icon: "text.viewfinder"), + PromptChip(label: "What objects?", prompt: "What objects are in this image?", icon: "cube.transparent"), + PromptChip(label: "Read text (OCR)", prompt: "Read and extract any text visible in this image", icon: "doc.text.viewfinder"), + PromptChip(label: "Count items", prompt: "Count the distinct items or objects in this image", icon: "number.circle") + ] + + var body: some View { + NavigationStack { + VStack(spacing: 0) { + chatListView + Divider() + inputAreaView + } + .navigationTitle("SmolVLM2 Demo") + .navigationBarTitleDisplayMode(.inline) + .toolbar { + ToolbarItem(placement: .navigationBarTrailing) { + Button(action: clearHistory) { + Image(systemName: "trash") + .foregroundColor(.red) + } + .disabled(chatHistory.isEmpty) + } + } + .onAppear { + encoderManager.loadModel() + } + .sheet(isPresented: $showCamera) { + CameraView(image: $selectedImage) + } + } + } + + // MARK: - Chat List + + private var chatListView: some View { + ScrollViewReader { proxy in + ScrollView { + LazyVStack(spacing: 16) { + if chatHistory.isEmpty && !isStreaming { + welcomeView + } + + ForEach(chatHistory) { message in + ChatBubbleView(message: message) + .id(message.id) + } + + if isStreaming { + streamingBubbleView + .id("streaming") + } + } + .padding() + } + .onAppear { scrollProxy = proxy } + .onChange(of: chatHistory.count) { _ in + withAnimation { + if let lastMessage = chatHistory.last { + proxy.scrollTo(lastMessage.id, anchor: .bottom) + } + } + } + .onChange(of: isStreaming) { streaming in + if streaming { + withAnimation { + proxy.scrollTo("streaming", anchor: .bottom) + } + } + } + } + } + + // MARK: - Welcome View + + private var welcomeView: some View { + VStack(spacing: 16) { + Spacer().frame(height: 40) + + Image(systemName: "eye.circle.fill") + .font(.system(size: 64)) + .foregroundStyle(.linearGradient( + colors: [.purple, .blue], + startPoint: .topLeading, + endPoint: .bottomTrailing + )) + + Text("SmolVLM2 Vision-Language Model") + .font(.title2) + .fontWeight(.bold) + + Text("Select an image and ask a question about it. The vision encoder will analyze your image's visual features.") + .font(.subheadline) + .foregroundColor(.secondary) + .multilineTextAlignment(.center) + .padding(.horizontal, 32) + + if let error = encoderManager.errorMessage { + HStack(spacing: 8) { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.orange) + Text(error) + .font(.caption) + .foregroundColor(.secondary) + } + .padding() + .background(Color.orange.opacity(0.1)) + .cornerRadius(12) + .padding(.horizontal) + } else if encoderManager.isModelLoaded { + Label("Vision encoder loaded", systemImage: "checkmark.circle.fill") + .font(.caption) + .foregroundColor(.green) + } else { + HStack(spacing: 8) { + ProgressView() + .scaleEffect(0.8) + Text("Loading vision encoder...") + .font(.caption) + .foregroundColor(.secondary) + } + } + + Spacer().frame(height: 20) + } + } + + // MARK: - Streaming Bubble + + private var streamingBubbleView: some View { + VStack(alignment: .leading, spacing: 8) { + if let img = selectedImage { + Image(uiImage: img) + .resizable() + .aspectRatio(contentMode: .fill) + .frame(maxWidth: 200, maxHeight: 150) + .clipped() + .cornerRadius(12) + } + + Text(questionText.isEmpty ? "Analyzing..." 
: questionText) + .font(.subheadline) + .fontWeight(.medium) + .foregroundColor(.purple) + + if displayedResponse.isEmpty { + HStack(spacing: 4) { + ForEach(0..<3) { i in + Circle() + .fill(Color.gray.opacity(0.5)) + .frame(width: 8, height: 8) + .scaleEffect(isStreaming ? 1.2 : 0.8) + .animation( + .easeInOut(duration: 0.6) + .repeatForever() + .delay(Double(i) * 0.2), + value: isStreaming + ) + } + } + .padding(.vertical, 4) + } else { + Text(displayedResponse) + .font(.body) + .foregroundColor(.primary) + .textSelection(.enabled) + } + } + .padding() + .frame(maxWidth: .infinity, alignment: .leading) + .background(Color(.systemGray6)) + .cornerRadius(16) + } + + // MARK: - Input Area + + private var inputAreaView: some View { + VStack(spacing: 10) { + // Image preview and picker + imageSelectionRow + + // Preset prompt chips + ScrollView(.horizontal, showsIndicators: false) { + HStack(spacing: 8) { + ForEach(presetPrompts) { chip in + Button { + questionText = chip.prompt + } label: { + Label(chip.label, systemImage: chip.icon) + .font(.caption) + .padding(.horizontal, 12) + .padding(.vertical, 6) + .background(Color.purple.opacity(0.1)) + .foregroundColor(.purple) + .cornerRadius(16) + } + } + } + .padding(.horizontal) + } + + // Text input and send + HStack(spacing: 10) { + TextField("Ask about the image...", text: $questionText, axis: .vertical) + .lineLimit(1...4) + .textFieldStyle(.plain) + .padding(10) + .background(Color(.systemGray6)) + .cornerRadius(20) + + Button(action: sendQuestion) { + Image(systemName: "arrow.up.circle.fill") + .font(.system(size: 34)) + .foregroundStyle(.linearGradient( + colors: canSend ? [.purple, .blue] : [.gray, .gray], + startPoint: .topLeading, + endPoint: .bottomTrailing + )) + } + .disabled(!canSend) + } + .padding(.horizontal) + .padding(.bottom, 8) + } + .padding(.top, 8) + .background(Color(.systemBackground)) + } + + private var imageSelectionRow: some View { + HStack(spacing: 12) { + // Selected image thumbnail + if let img = selectedImage { + ZStack(alignment: .topTrailing) { + Image(uiImage: img) + .resizable() + .aspectRatio(contentMode: .fill) + .frame(width: 60, height: 60) + .clipped() + .cornerRadius(10) + + Button { + selectedImage = nil + } label: { + Image(systemName: "xmark.circle.fill") + .font(.system(size: 18)) + .foregroundColor(.white) + .background(Circle().fill(Color.black.opacity(0.5))) + } + .offset(x: 4, y: -4) + } + } + + // Photo picker + PhotosPicker( + selection: $photoPickerItem, + matching: .images, + photoLibrary: .shared() + ) { + Label("Photos", systemImage: "photo.on.rectangle") + .font(.subheadline) + .padding(.horizontal, 14) + .padding(.vertical, 8) + .background(Color(.systemGray5)) + .cornerRadius(20) + } + .onChange(of: photoPickerItem) { newItem in + guard let newItem = newItem else { return } + Task { + if let data = try? 
await newItem.loadTransferable(type: Data.self), + let uiImage = UIImage(data: data) { + selectedImage = uiImage + } + } + } + + // Camera button + Button { + showCamera = true + } label: { + Label("Camera", systemImage: "camera") + .font(.subheadline) + .padding(.horizontal, 14) + .padding(.vertical, 8) + .background(Color(.systemGray5)) + .cornerRadius(20) + } + + Spacer() + } + .padding(.horizontal) + } + + // MARK: - Logic + + private var canSend: Bool { + selectedImage != nil && !questionText.trimmingCharacters(in: .whitespaces).isEmpty && !isStreaming + } + + private func sendQuestion() { + guard let image = selectedImage, + !questionText.trimmingCharacters(in: .whitespaces).isEmpty else { return } + + let prompt = questionText.trimmingCharacters(in: .whitespaces) + currentResponse = "" + displayedResponse = "" + isStreaming = true + + let analyzeCompletion: (String) -> Void = { [self] result in + self.currentResponse = result + self.startStreamingDisplay(image: image, prompt: prompt) + } + + if encoderManager.isModelLoaded { + encoderManager.encodeImage(image, prompt: prompt, completion: analyzeCompletion) + } else { + encoderManager.simulateAnalysis(for: image, prompt: prompt, completion: analyzeCompletion) + } + } + + private func startStreamingDisplay(image: UIImage, prompt: String) { + let fullText = currentResponse + var charIndex = 0 + displayedResponse = "" + + streamingTimer?.invalidate() + streamingTimer = Timer.scheduledTimer(withTimeInterval: 0.015, repeats: true) { timer in + if charIndex < fullText.count { + let index = fullText.index(fullText.startIndex, offsetBy: charIndex) + displayedResponse.append(fullText[index]) + charIndex += 1 + } else { + timer.invalidate() + streamingTimer = nil + + let message = ChatMessage( + image: image, + question: prompt, + response: fullText, + timestamp: Date() + ) + chatHistory.append(message) + isStreaming = false + questionText = "" + displayedResponse = "" + currentResponse = "" + } + } + } + + private func clearHistory() { + chatHistory.removeAll() + currentResponse = "" + displayedResponse = "" + isStreaming = false + streamingTimer?.invalidate() + streamingTimer = nil + } +} + +// MARK: - Chat Bubble View + +struct ChatBubbleView: View { + let message: ChatMessage + + @State private var isExpanded = false + + var body: some View { + VStack(alignment: .leading, spacing: 8) { + // Image thumbnail + if let image = message.image { + Button { + isExpanded.toggle() + } label: { + Image(uiImage: image) + .resizable() + .aspectRatio(contentMode: isExpanded ? .fit : .fill) + .frame( + maxWidth: isExpanded ? .infinity : 200, + maxHeight: isExpanded ? 
300 : 120
+                        )
+                        .clipped()
+                        .cornerRadius(12)
+                }
+            }
+
+            // Question
+            HStack(alignment: .top, spacing: 6) {
+                Image(systemName: "person.circle.fill")
+                    .foregroundColor(.purple)
+                    .font(.subheadline)
+                Text(message.question)
+                    .font(.subheadline)
+                    .fontWeight(.medium)
+                    .foregroundColor(.purple)
+            }
+
+            // Divider
+            Rectangle()
+                .fill(Color.gray.opacity(0.2))
+                .frame(height: 1)
+
+            // Response
+            HStack(alignment: .top, spacing: 6) {
+                Image(systemName: "eye.circle.fill")
+                    .foregroundColor(.blue)
+                    .font(.subheadline)
+                Text(message.response)
+                    .font(.body)
+                    .foregroundColor(.primary)
+                    .textSelection(.enabled)
+            }
+
+            // Timestamp
+            Text(message.formattedTime)
+                .font(.caption2)
+                .foregroundColor(.secondary)
+                .frame(maxWidth: .infinity, alignment: .trailing)
+        }
+        .padding()
+        .background(Color(.systemGray6))
+        .cornerRadius(16)
+        .animation(.easeInOut(duration: 0.3), value: isExpanded)
+    }
+}
+
+// MARK: - Camera View
+
+struct CameraView: UIViewControllerRepresentable {
+    @Binding var image: UIImage?
+    @Environment(\.dismiss) private var dismiss
+
+    func makeUIViewController(context: Context) -> UIImagePickerController {
+        let picker = UIImagePickerController()
+        picker.sourceType = .camera
+        picker.delegate = context.coordinator
+        return picker
+    }
+
+    func updateUIViewController(_ uiViewController: UIImagePickerController, context: Context) {}
+
+    func makeCoordinator() -> Coordinator {
+        Coordinator(self)
+    }
+
+    class Coordinator: NSObject, UIImagePickerControllerDelegate, UINavigationControllerDelegate {
+        let parent: CameraView
+
+        init(_ parent: CameraView) {
+            self.parent = parent
+        }
+
+        func imagePickerController(
+            _ picker: UIImagePickerController,
+            didFinishPickingMediaWithInfo info: [UIImagePickerController.InfoKey: Any]
+        ) {
+            if let uiImage = info[.originalImage] as? UIImage {
+                parent.image = uiImage
+            }
+            parent.dismiss()
+        }
+
+        func imagePickerControllerDidCancel(_ picker: UIImagePickerController) {
+            parent.dismiss()
+        }
+    }
+}
+
+// MARK: - Preview
+
+#Preview {
+    ContentView()
+}
diff --git a/creative_apps/SmolVLMDemo/SmolVLMDemo/Info.plist b/creative_apps/SmolVLMDemo/SmolVLMDemo/Info.plist
new file mode 100644
index 0000000..bf004ac
--- /dev/null
+++ b/creative_apps/SmolVLMDemo/SmolVLMDemo/Info.plist
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSCameraUsageDescription</key>
+	<string>This app needs camera access to capture images for visual question answering.</string>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>This app needs photo library access for selecting images to analyze.</string>
+</dict>
+</plist>
diff --git a/creative_apps/SmolVLMDemo/SmolVLMDemo/SmolVLMDemoApp.swift b/creative_apps/SmolVLMDemo/SmolVLMDemo/SmolVLMDemoApp.swift
new file mode 100644
index 0000000..cd2aa43
--- /dev/null
+++ b/creative_apps/SmolVLMDemo/SmolVLMDemo/SmolVLMDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct SmolVLMDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/creative_apps/WhisperDemo/WhisperDemo.xcodeproj/project.pbxproj b/creative_apps/WhisperDemo/WhisperDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..ec226e6
--- /dev/null
+++ b/creative_apps/WhisperDemo/WhisperDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,270 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		WH0001 /* WhisperDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = WH0002; };
+		WH0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = WH0004; };
+		WH0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = WH0006; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		WH0007 /* WhisperDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = WhisperDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		WH0002 /* WhisperDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WhisperDemoApp.swift; sourceTree = "<group>"; };
+		WH0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		WH0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		WH0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		WH0009 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		WH0010 = {
+			isa = PBXGroup;
+			children = (
+				WH0011 /* WhisperDemo */,
+				WH0012 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		WH0011 /* WhisperDemo */ = {
+			isa = PBXGroup;
+			children = (
+				WH0002 /* WhisperDemoApp.swift */,
+				WH0004 /* ContentView.swift */,
+				WH0006 /* Assets.xcassets */,
+				WH0008 /* Info.plist */,
+			);
+			path = WhisperDemo;
+			sourceTree = "<group>";
+		};
+		WH0012 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				WH0007 /* WhisperDemo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		WH0013 /* WhisperDemo */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = WH0014;
+			buildPhases = (
+				WH0015 /* Sources */,
+				WH0009 /* Frameworks */,
+				WH0016 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = WhisperDemo;
+			productName = WhisperDemo;
+			productReference = WH0007;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		WH0017 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				BuildIndependentTargetsInParallel = 1;
+				LastSwiftUpdateCheck = 1500;
+				LastUpgradeCheck = 1500;
+				TargetAttributes = {
+					WH0013 = {
+						CreatedOnToolsVersion = 15.0;
+					};
+				};
+			};
+			buildConfigurationList = WH0018;
+			compatibilityVersion = "Xcode 14.0";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = WH0010;
+			productRefGroup = WH0012;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				WH0013,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+		WH0016 /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				WH0005 /* Assets.xcassets in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		WH0015 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+ WH0001 /* WhisperDemoApp.swift in Sources */, + WH0003 /* ContentView.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + WH0019 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + WH0020 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + WH0021 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = WhisperDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.whisperdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + WH0022 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = WhisperDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; 
+ INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.whisperdemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + WH0018 /* Build configuration list for PBXProject */ = { + isa = XCConfigurationList; + buildConfigurations = ( + WH0019, + WH0020, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + WH0014 /* Build configuration list for PBXNativeTarget */ = { + isa = XCConfigurationList; + buildConfigurations = ( + WH0021, + WH0022, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = WH0017; +} diff --git a/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/Contents.json b/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/WhisperDemo/WhisperDemo/ContentView.swift b/creative_apps/WhisperDemo/WhisperDemo/ContentView.swift new file mode 100644 index 0000000..d249cc9 --- /dev/null +++ b/creative_apps/WhisperDemo/WhisperDemo/ContentView.swift @@ -0,0 +1,830 @@ +import SwiftUI +import CoreML +import AVFoundation +import Accelerate + +// MARK: - Whisper Tiny Speech Recognition Demo +// +// Whisper is a general-purpose speech recognition model by OpenAI. +// The encoder processes a mel spectrogram (80 bins x 3000 frames for 30s of audio) +// and produces hidden states that the decoder uses autoregressively to generate tokens. +// +// This demo records audio via the microphone, computes a log-mel spectrogram using +// the Accelerate framework (vDSP), runs the WhisperTiny encoder CoreML model, and +// displays transcription results. 
The decoder step is simplified for demonstration; +// a production app should use WhisperKit or a full encoder+decoder pipeline. + +// MARK: - Supported Languages + +enum WhisperLanguage: String, CaseIterable, Identifiable { + case english = "English" + case japanese = "Japanese" + case spanish = "Spanish" + case french = "French" + case german = "German" + case chinese = "Chinese" + case korean = "Korean" + case portuguese = "Portuguese" + + var id: String { rawValue } + + var code: String { + switch self { + case .english: return "en" + case .japanese: return "ja" + case .spanish: return "es" + case .french: return "fr" + case .german: return "de" + case .chinese: return "zh" + case .korean: return "ko" + case .portuguese: return "pt" + } + } +} + +// MARK: - Transcription Entry + +struct TranscriptionEntry: Identifiable { + let id = UUID() + let text: String + let language: WhisperLanguage + let timestamp: Date + let duration: TimeInterval +} + +// MARK: - ContentView + +struct ContentView: View { + @StateObject private var viewModel = WhisperViewModel() + + var body: some View { + NavigationStack { + VStack(spacing: 0) { + // Language picker + HStack { + Text("Language") + .font(.subheadline) + .foregroundColor(.secondary) + Spacer() + Picker("Language", selection: $viewModel.selectedLanguage) { + ForEach(WhisperLanguage.allCases) { lang in + Text(lang.rawValue).tag(lang) + } + } + .pickerStyle(.menu) + } + .padding(.horizontal) + .padding(.top, 8) + + Divider() + .padding(.vertical, 8) + + // Waveform visualization + WaveformVisualization( + samples: viewModel.audioSamples, + isRecording: viewModel.isRecording + ) + .frame(height: 100) + .padding(.horizontal) + .padding(.bottom, 8) + + // Recording controls + VStack(spacing: 12) { + RecordButton( + isRecording: viewModel.isRecording, + onTap: { + if viewModel.isRecording { + viewModel.stopRecording() + } else { + viewModel.startRecording() + } + } + ) + + Text(viewModel.isRecording ? 
"Tap to stop recording" : "Tap to start recording") + .font(.caption) + .foregroundColor(.secondary) + + if viewModel.isRecording { + Text(viewModel.formattedRecordingDuration) + .font(.system(.title3, design: .monospaced)) + .foregroundColor(.red) + } + } + .padding(.vertical, 12) + + // Processing indicator + if viewModel.isProcessing { + VStack(spacing: 8) { + ProgressView() + .scaleEffect(1.2) + Text(viewModel.processingStatus) + .font(.caption) + .foregroundColor(.secondary) + ProgressView(value: viewModel.processingProgress) + .progressViewStyle(.linear) + .padding(.horizontal, 40) + } + .padding() + } + + // Error display + if let error = viewModel.errorMessage { + HStack { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.red) + Text(error) + .font(.caption) + .foregroundColor(.red) + } + .padding() + .frame(maxWidth: .infinity) + .background(Color.red.opacity(0.1)) + .cornerRadius(8) + .padding(.horizontal) + } + + // Current transcription result + if let current = viewModel.currentTranscription { + VStack(alignment: .leading, spacing: 8) { + HStack { + Text("Transcription") + .font(.headline) + Spacer() + Button(action: { viewModel.copyToClipboard(current.text) }) { + Image(systemName: "doc.on.doc") + .font(.body) + } + } + Text(current.text) + .font(.body) + .padding() + .frame(maxWidth: .infinity, alignment: .leading) + .background(Color(.systemGray6)) + .cornerRadius(10) + HStack { + Text(current.language.rawValue) + .font(.caption2) + .padding(.horizontal, 8) + .padding(.vertical, 2) + .background(Color.accentColor.opacity(0.15)) + .cornerRadius(4) + Text(formatDuration(current.duration)) + .font(.caption2) + .foregroundColor(.secondary) + } + } + .padding(.horizontal) + .padding(.vertical, 8) + } + + Divider() + .padding(.vertical, 4) + + // History list + if viewModel.transcriptionHistory.isEmpty && viewModel.currentTranscription == nil { + Spacer() + VStack(spacing: 12) { + Image(systemName: "waveform.circle") + .font(.system(size: 48)) + .foregroundColor(.secondary.opacity(0.5)) + Text("Record audio to begin transcription") + .font(.subheadline) + .foregroundColor(.secondary) + } + Spacer() + } else { + ScrollView { + LazyVStack(spacing: 10) { + ForEach(viewModel.transcriptionHistory) { entry in + TranscriptionRow( + entry: entry, + onCopy: { viewModel.copyToClipboard(entry.text) } + ) + } + } + .padding(.horizontal) + .padding(.vertical, 8) + } + } + } + .navigationTitle("Whisper Transcribe") + .toolbar { + ToolbarItem(placement: .navigationBarTrailing) { + if !viewModel.transcriptionHistory.isEmpty { + Button("Clear") { + viewModel.clearHistory() + } + } + } + } + .onAppear { + viewModel.requestMicrophonePermission() + } + } + } + + private func formatDuration(_ duration: TimeInterval) -> String { + let seconds = Int(duration) + let ms = Int((duration - Double(seconds)) * 10) + return String(format: "%d.%ds", seconds, ms) + } +} + +// MARK: - Record Button + +struct RecordButton: View { + let isRecording: Bool + let onTap: () -> Void + + var body: some View { + Button(action: onTap) { + ZStack { + Circle() + .fill(isRecording ? Color.red.opacity(0.15) : Color.accentColor.opacity(0.1)) + .frame(width: 80, height: 80) + + Circle() + .fill(isRecording ? 
Color.red : Color.accentColor)
+                    .frame(width: 60, height: 60)
+
+                if isRecording {
+                    RoundedRectangle(cornerRadius: 4)
+                        .fill(Color.white)
+                        .frame(width: 22, height: 22)
+                } else {
+                    Circle()
+                        .fill(Color.white)
+                        .frame(width: 24, height: 24)
+                }
+            }
+        }
+        .buttonStyle(.plain)
+        .animation(.easeInOut(duration: 0.2), value: isRecording)
+    }
+}
+
+// MARK: - Waveform Visualization
+
+struct WaveformVisualization: View {
+    let samples: [Float]
+    let isRecording: Bool
+    @State private var animationPhase: CGFloat = 0
+
+    var body: some View {
+        TimelineView(.animation(minimumInterval: 1.0 / 30.0)) { timeline in
+            Canvas { context, size in
+                let midY = size.height / 2
+                let barWidth: CGFloat = 3
+                let gap: CGFloat = 2
+                let totalBarWidth = barWidth + gap
+                let barCount = Int(size.width / totalBarWidth)
+
+                if isRecording && !samples.isEmpty {
+                    let step = max(1, samples.count / barCount)
+                    for i in 0..<barCount {
+                        // Map each bar to a recent amplitude sample
+                        let sampleIndex = min(i * step, samples.count - 1)
+                        let amplitude = CGFloat(samples[sampleIndex])
+                        let barHeight = max(2, amplitude * size.height)
+                        let rect = CGRect(
+                            x: CGFloat(i) * totalBarWidth,
+                            y: midY - barHeight / 2,
+                            width: barWidth,
+                            height: barHeight
+                        )
+                        context.fill(Path(roundedRect: rect, cornerRadius: 1.5), with: .color(.red))
+                    }
+                } else {
+                    // Idle state: draw a flat center line
+                    let rect = CGRect(x: 0, y: midY - 1, width: size.width, height: 2)
+                    context.fill(Path(rect), with: .color(.gray.opacity(0.3)))
+                }
+            }
+        }
+    }
+}
+
+// MARK: - Transcription Row
+
+struct TranscriptionRow: View {
+    let entry: TranscriptionEntry
+    let onCopy: () -> Void
+
+    var body: some View {
+        VStack(alignment: .leading, spacing: 6) {
+            HStack {
+                Text(entry.language.rawValue)
+                    .font(.caption2)
+                    .fontWeight(.medium)
+                    .padding(.horizontal, 6)
+                    .padding(.vertical, 2)
+                    .background(Color.accentColor.opacity(0.12))
+                    .cornerRadius(4)
+
+                Text(entry.timestamp, style: .time)
+                    .font(.caption2)
+                    .foregroundColor(.secondary)
+
+                Spacer()
+
+                Button(action: onCopy) {
+                    Image(systemName: "doc.on.doc")
+                        .font(.caption)
+                        .foregroundColor(.secondary)
+                }
+            }
+
+            Text(entry.text)
+                .font(.body)
+                .lineLimit(4)
+        }
+        .padding()
+        .background(Color(.systemGray6))
+        .cornerRadius(10)
+    }
+}
+
+// MARK: - Clamped Extension
+
+private extension Double {
+    func clamped(to range: ClosedRange<Double>) -> Double {
+        return min(max(self, range.lowerBound), range.upperBound)
+    }
+}
+
+// MARK: - WhisperViewModel
+
+class WhisperViewModel: ObservableObject {
+    @Published var selectedLanguage: WhisperLanguage = .english
+    @Published var isRecording = false
+    @Published var isProcessing = false
+    @Published var processingStatus = ""
+    @Published var processingProgress: Double = 0
+    @Published var errorMessage: String?
+    @Published var currentTranscription: TranscriptionEntry?
+    @Published var transcriptionHistory: [TranscriptionEntry] = []
+    @Published var audioSamples: [Float] = []
+    @Published var recordingDuration: TimeInterval = 0
+
+    private var audioRecorder: AVAudioRecorder?
+    private var recordingURL: URL?
+    private var recordingTimer: Timer?
+    private var sampleTimer: Timer?
+    private var recordingStartTime: Date?
+
+    var formattedRecordingDuration: String {
+        let minutes = Int(recordingDuration) / 60
+        let seconds = Int(recordingDuration) % 60
+        let tenths = Int((recordingDuration - floor(recordingDuration)) * 10)
+        return String(format: "%d:%02d.%d", minutes, seconds, tenths)
+    }
+
+    // MARK: - Microphone Permission
+
+    func requestMicrophonePermission() {
+        AVAudioSession.sharedInstance().requestRecordPermission { [weak self] granted in
+            DispatchQueue.main.async {
+                if !granted {
+                    self?.errorMessage = "Microphone access denied. Please enable it in Settings."
+ } + } + } + } + + // MARK: - Recording + + func startRecording() { + errorMessage = nil + currentTranscription = nil + + let session = AVAudioSession.sharedInstance() + do { + try session.setCategory(.playAndRecord, mode: .default, options: [.defaultToSpeaker]) + try session.setActive(true) + } catch { + errorMessage = "Failed to configure audio session: \(error.localizedDescription)" + return + } + + let tempDir = FileManager.default.temporaryDirectory + let fileName = "whisper_recording_\(UUID().uuidString).wav" + let fileURL = tempDir.appendingPathComponent(fileName) + recordingURL = fileURL + + let settings: [String: Any] = [ + AVFormatIDKey: Int(kAudioFormatLinearPCM), + AVSampleRateKey: 16000.0, + AVNumberOfChannelsKey: 1, + AVLinearPCMBitDepthKey: 16, + AVLinearPCMIsFloatKey: false, + AVLinearPCMIsBigEndianKey: false + ] + + do { + audioRecorder = try AVAudioRecorder(url: fileURL, settings: settings) + audioRecorder?.isMeteringEnabled = true + audioRecorder?.record() + isRecording = true + recordingStartTime = Date() + recordingDuration = 0 + audioSamples = [] + startTimers() + } catch { + errorMessage = "Failed to start recording: \(error.localizedDescription)" + } + } + + func stopRecording() { + guard isRecording else { return } + + audioRecorder?.stop() + isRecording = false + stopTimers() + + let duration = recordingDuration + + guard let url = recordingURL else { + errorMessage = "Recording file not found." + return + } + + processRecording(url: url, duration: duration) + } + + private func startTimers() { + recordingTimer = Timer.scheduledTimer(withTimeInterval: 0.1, repeats: true) { [weak self] _ in + guard let self = self, let start = self.recordingStartTime else { return } + DispatchQueue.main.async { + self.recordingDuration = Date().timeIntervalSince(start) + } + } + + sampleTimer = Timer.scheduledTimer(withTimeInterval: 0.05, repeats: true) { [weak self] _ in + guard let self = self else { return } + self.audioRecorder?.updateMeters() + let power = self.audioRecorder?.averagePower(forChannel: 0) ?? -160 + // Convert dB to linear amplitude (0..1) + let linear = pow(10, power / 20) + DispatchQueue.main.async { + self.audioSamples.append(linear) + // Keep a rolling window of samples for visualization + if self.audioSamples.count > 400 { + self.audioSamples.removeFirst(self.audioSamples.count - 400) + } + } + } + } + + private func stopTimers() { + recordingTimer?.invalidate() + recordingTimer = nil + sampleTimer?.invalidate() + sampleTimer = nil + } + + // MARK: - Audio Processing + + private func processRecording(url: URL, duration: TimeInterval) { + isProcessing = true + errorMessage = nil + processingProgress = 0 + processingStatus = "Loading audio..." + + Task { + do { + let transcription = try await runWhisperPipeline(url: url, duration: duration) + await MainActor.run { + let entry = TranscriptionEntry( + text: transcription, + language: self.selectedLanguage, + timestamp: Date(), + duration: duration + ) + self.currentTranscription = entry + self.transcriptionHistory.insert(entry, at: 0) + self.isProcessing = false + self.processingProgress = 1.0 + } + } catch { + await MainActor.run { + self.errorMessage = error.localizedDescription + self.isProcessing = false + } + } + + // Clean up temp file + try? FileManager.default.removeItem(at: url) + } + } + + /// Full Whisper pipeline: load audio -> compute mel spectrogram -> run encoder -> decode + /// + /// NOTE: The decoder step is simplified here. A full implementation would: + /// 1. 
Feed encoder output into the decoder model autoregressively
+    /// 2. Use greedy or beam search to generate token IDs
+    /// 3. Decode token IDs using the Whisper tokenizer
+    /// For production use, consider WhisperKit (github.com/argmaxinc/WhisperKit).
+    private func runWhisperPipeline(url: URL, duration: TimeInterval) async throws -> String {
+        // Step 1: Load audio samples from WAV file
+        await updateProgress("Loading audio file...", progress: 0.1)
+
+        let audioData = try loadAudioSamples(from: url)
+
+        // Step 2: Compute log-mel spectrogram using Accelerate
+        await updateProgress("Computing mel spectrogram...", progress: 0.3)
+
+        let melSpectrogram = try computeMelSpectrogram(from: audioData)
+
+        // Step 3: Load and run encoder model
+        await updateProgress("Running Whisper encoder...", progress: 0.5)
+
+        guard let modelURL = Bundle.main.url(forResource: "WhisperTinyEncoder", withExtension: "mlmodelc") else {
+            throw WhisperError.modelNotFound(
+                "WhisperTinyEncoder.mlmodelc not found in bundle. " +
+                "Run convert_whisper.py to generate the model, then add the compiled " +
+                "WhisperTinyEncoder.mlmodelc to the Xcode project."
+            )
+        }
+
+        let config = MLModelConfiguration()
+        config.computeUnits = .cpuAndNeuralEngine
+        let model = try MLModel(contentsOf: modelURL, configuration: config)
+
+        // Prepare mel input: shape (1, 80, 3000)
+        let melInput = try MLMultiArray(shape: [1, 80, 3000], dataType: .float32)
+        let melCount = min(melSpectrogram.count, 80 * 3000)
+        for i in 0..<melCount {
+            melInput[i] = NSNumber(value: melSpectrogram[i])
+        }
+
+        // The input feature name must match whatever the converted model declares
+        let inputName = model.modelDescription.inputDescriptionsByName.keys.first ?? "mel"
+        let input = try MLDictionaryFeatureProvider(
+            dictionary: [inputName: MLFeatureValue(multiArray: melInput)]
+        )
+
+        await updateProgress("Encoding...", progress: 0.7)
+        let output = try model.prediction(from: input)
+
+        // Step 4: Simplified decode -- report encoder output shapes instead of
+        // running the full autoregressive decoder (see NOTE above)
+        await updateProgress("Decoding...", progress: 0.9)
+
+        var result = "[Encoder output]\n"
+        for name in output.featureNames {
+            if let array = output.featureValue(for: name)?.multiArrayValue {
+                result += "'\(name)': shape \(array.shape.map { $0.intValue })\n"
+            }
+        }
+        result += "Full transcription requires the decoder model; see the note above."
+        return result
+    }
+
+    /// Load 16-bit PCM mono samples from a WAV file recorded at 16 kHz
+    private func loadAudioSamples(from url: URL) throws -> [Float] {
+        let fileData = try Data(contentsOf: url)
+
+        // WAV header is 44 bytes; PCM 16-bit mono samples follow
+        guard fileData.count > 44 else {
+            throw WhisperError.processingFailed("Audio file too short or corrupted.")
+        }
+
+        let sampleData = fileData.dropFirst(44)
+        let sampleCount = sampleData.count / 2 // 16-bit = 2 bytes per sample
+
+        var floatSamples = [Float](repeating: 0, count: sampleCount)
+        sampleData.withUnsafeBytes { rawBuffer in
+            guard let baseAddress = rawBuffer.baseAddress else { return }
+            let int16Ptr = baseAddress.bindMemory(to: Int16.self, capacity: sampleCount)
+            // Use vDSP to convert Int16 samples to Float32, then normalize to [-1, 1]
+            vDSP_vflt16(int16Ptr, 1, &floatSamples, 1, vDSP_Length(sampleCount))
+            var scale: Float = 1.0 / 32768.0
+            vDSP_vsmul(floatSamples, 1, &scale, &floatSamples, 1, vDSP_Length(sampleCount))
+        }
+
+        return floatSamples
+    }
+
+    /// Compute 80-bin log-mel spectrogram from audio samples
+    ///
+    /// Whisper expects: 80 mel bins, 3000 time frames (for 30s at 16kHz with hop=160).
+    /// Parameters: FFT size = 400, hop length = 160, sample rate = 16000.
+    ///
+    /// This implementation uses Accelerate's vDSP for the FFT computation.
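+    ///
+    /// A quick sanity check on those numbers (not from the original comment,
+    /// just arithmetic): 30 s at 16 kHz is 480,000 samples, and with a hop of
+    /// 160 samples that is 480,000 / 160 = 3,000 hops, matching the 3,000-frame
+    /// input the encoder expects.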
+    private func computeMelSpectrogram(from samples: [Float]) throws -> [Float] {
+        let fftSize = 400
+        let hopLength = 160
+        let numMelBins = 80
+        let maxFrames = 3000
+        let sampleRate: Float = 16000.0
+
+        // Pad or truncate audio to 30 seconds (480000 samples)
+        let targetLength = 480000
+        var paddedSamples: [Float]
+        if samples.count >= targetLength {
+            paddedSamples = Array(samples.prefix(targetLength))
+        } else {
+            paddedSamples = samples + [Float](repeating: 0, count: targetLength - samples.count)
+        }
+
+        // Number of frames
+        let numFrames = min((paddedSamples.count - fftSize) / hopLength + 1, maxFrames)
+
+        // Create FFT setup
+        let log2n = vDSP_Length(ceil(log2(Float(fftSize))))
+        guard let fftSetup = vDSP_create_fftsetup(log2n, FFTRadix(kFFTRadix2)) else {
+            throw WhisperError.processingFailed("Failed to create FFT setup.")
+        }
+        defer { vDSP_destroy_fftsetup(fftSetup) }
+
+        let fftSizeAligned = Int(pow(2, ceil(log2(Float(fftSize)))))
+        let halfFFT = fftSizeAligned / 2
+
+        // Hann window
+        var window = [Float](repeating: 0, count: fftSize)
+        vDSP_hann_window(&window, vDSP_Length(fftSize), Int32(vDSP_HANN_NORM))
+
+        // Compute mel filter bank (simplified triangular filters)
+        let melFilters = createMelFilterBank(
+            numMelBins: numMelBins,
+            fftSize: fftSizeAligned,
+            sampleRate: sampleRate,
+            numFreqBins: halfFFT + 1
+        )
+
+        // Output: (numMelBins x numFrames) stored row-major
+        var melSpectrogram = [Float](repeating: 0, count: numMelBins * maxFrames)
+
+        // Process each frame: apply the window, zero-pad to the aligned FFT size,
+        // take the power spectrum, project through the mel filters, log-compress
+        var windowed = [Float](repeating: 0, count: fftSizeAligned)
+        var realPart = [Float](repeating: 0, count: halfFFT)
+        var imagPart = [Float](repeating: 0, count: halfFFT)
+        var power = [Float](repeating: 0, count: halfFFT + 1)
+
+        for frame in 0..<numFrames {
+            let start = frame * hopLength
+            // Windowed slice; elements beyond fftSize remain zero-padded
+            paddedSamples.withUnsafeBufferPointer { buf in
+                vDSP_vmul(buf.baseAddress! + start, 1, window, 1, &windowed, 1, vDSP_Length(fftSize))
+            }
+
+            // Pack into split-complex form and run the in-place real FFT
+            realPart.withUnsafeMutableBufferPointer { realBuf in
+                imagPart.withUnsafeMutableBufferPointer { imagBuf in
+                    var splitComplex = DSPSplitComplex(realp: realBuf.baseAddress!, imagp: imagBuf.baseAddress!)
+                    windowed.withUnsafeBytes { raw in
+                        vDSP_ctoz(raw.bindMemory(to: DSPComplex.self).baseAddress!, 2, &splitComplex, 1, vDSP_Length(halfFFT))
+                    }
+                    vDSP_fft_zrip(fftSetup, &splitComplex, 1, log2n, FFTDirection(kFFTDirection_Forward))
+                    vDSP_zvmags(&splitComplex, 1, &power, 1, vDSP_Length(halfFFT))
+                }
+            }
+            power[halfFFT] = 0
+
+            // Mel projection + log compression
+            for m in 0..<numMelBins {
+                var melEnergy: Float = 0
+                for k in 0...halfFFT {
+                    melEnergy += melFilters[m * (halfFFT + 1) + k] * power[k]
+                }
+                melSpectrogram[m * maxFrames + frame] = log10(max(melEnergy, 1e-10))
+            }
+        }
+
+        return melSpectrogram
+    }
+
+    /// Create a bank of triangular mel filters (numMelBins x numFreqBins, row-major)
+    private func createMelFilterBank(numMelBins: Int, fftSize: Int, sampleRate: Float, numFreqBins: Int) -> [Float] {
+        func hzToMel(_ hz: Float) -> Float {
+            return 2595.0 * log10(1.0 + hz / 700.0)
+        }
+
+        func melToHz(_ mel: Float) -> Float {
+            return 700.0 * (pow(10.0, mel / 2595.0) - 1.0)
+        }
+
+        let lowFreq: Float = 0
+        let highFreq = sampleRate / 2.0
+        let lowMel = hzToMel(lowFreq)
+        let highMel = hzToMel(highFreq)
+
+        // Equally spaced mel points
+        let numPoints = numMelBins + 2
+        var melPoints = [Float](repeating: 0, count: numPoints)
+        for i in 0..<numPoints {
+            melPoints[i] = lowMel + (highMel - lowMel) * Float(i) / Float(numPoints - 1)
+        }
+
+        // Convert mel points to FFT bin indices
+        let binPoints = melPoints.map { mel -> Int in
+            let hz = melToHz(mel)
+            return Int((Float(fftSize) + 1) * hz / sampleRate)
+        }
+
+        var filters = [Float](repeating: 0, count: numMelBins * numFreqBins)
+        for m in 0..<numMelBins {
+            let left = binPoints[m]
+            let center = binPoints[m + 1]
+            let right = binPoints[m + 2]
+
+            for k in left..<center {
+                if center > left {
+                    filters[m * numFreqBins + k] = Float(k - left) / Float(center - left)
+                }
+            }
+            for k in center..<right {
+                if right > center {
+                    filters[m * numFreqBins + k] = Float(right - k) / Float(right - center)
+                }
+            }
+        }
+
+        return filters
+    }
+
+    @MainActor
+    private func updateProgress(_ status: String, progress: Double) {
+        self.processingStatus = status
+        self.processingProgress = progress
+    }
+
+    // MARK: - Clipboard
+
+    func copyToClipboard(_ text: String) {
+        UIPasteboard.general.string = text
+    }
+
+    // MARK: - History
+
+    func clearHistory() {
+        transcriptionHistory.removeAll()
+        currentTranscription = nil
+    }
+}
+
+// MARK: - Errors
+
+enum WhisperError: LocalizedError {
+    case modelNotFound(String)
+    case processingFailed(String)
+
+    var errorDescription: String? {
+        switch self {
+        case .modelNotFound(let msg): return msg
+        case .processingFailed(let msg): return msg
+        }
+    }
+}
+
+#Preview {
+    ContentView()
+}
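+// MARK: - Decoder Sketch (illustrative)
+//
+// The pipeline above stops at the encoder. For reference, a full Whisper decode
+// is an autoregressive loop over the decoder model. The sketch below is NOT
+// part of the original demo: it assumes a hypothetical compiled decoder
+// ("WhisperTinyDecoder") whose feature names ("tokens", "encoder_output",
+// "logits") and special-token IDs are placeholders that must match however the
+// model was actually converted.
+/*
+func greedyDecode(encoderOutput: MLMultiArray, decoder: MLModel, maxTokens: Int = 224) throws -> [Int] {
+    let startOfTranscript = 50258   // placeholder special-token IDs
+    let endOfText = 50257
+    var tokens: [Int] = [startOfTranscript]
+
+    for _ in 0..<maxTokens {
+        // Pack the tokens generated so far into the decoder's token input
+        let tokenInput = try MLMultiArray(shape: [1, NSNumber(value: tokens.count)], dataType: .int32)
+        for (i, t) in tokens.enumerated() { tokenInput[i] = NSNumber(value: t) }
+
+        let input = try MLDictionaryFeatureProvider(dictionary: [
+            "tokens": MLFeatureValue(multiArray: tokenInput),
+            "encoder_output": MLFeatureValue(multiArray: encoderOutput)
+        ])
+        let output = try decoder.prediction(from: input)
+
+        // Greedy step: take the argmax over the last position's logits
+        guard let logits = output.featureValue(for: "logits")?.multiArrayValue else { break }
+        let vocabSize = logits.shape.last!.intValue
+        let lastOffset = logits.count - vocabSize
+        var best = 0
+        var bestValue = -Float.infinity
+        for v in 0..<vocabSize {
+            let value = logits[lastOffset + v].floatValue
+            if value > bestValue { bestValue = value; best = v }
+        }
+        if best == endOfText { break }
+        tokens.append(best)
+    }
+    return tokens
+}
+*/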
diff --git a/creative_apps/WhisperDemo/WhisperDemo/Info.plist b/creative_apps/WhisperDemo/WhisperDemo/Info.plist
new file mode 100644
index 0000000..53711f3
--- /dev/null
+++ b/creative_apps/WhisperDemo/WhisperDemo/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSMicrophoneUsageDescription</key>
+	<string>This app needs microphone access for speech recognition.</string>
+</dict>
+</plist>
diff --git a/creative_apps/WhisperDemo/WhisperDemo/WhisperDemoApp.swift b/creative_apps/WhisperDemo/WhisperDemo/WhisperDemoApp.swift
new file mode 100644
index 0000000..6468c10
--- /dev/null
+++ b/creative_apps/WhisperDemo/WhisperDemo/WhisperDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct WhisperDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/creative_apps/YOLOEDemo/YOLOEDemo.xcodeproj/project.pbxproj b/creative_apps/YOLOEDemo/YOLOEDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..18769dc
--- /dev/null
+++ b/creative_apps/YOLOEDemo/YOLOEDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,270 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		YE0001 /* YOLOEDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = YE0002; };
+		YE0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = YE0004; };
+		YE0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = YE0006; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		YE0007 /* YOLOEDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = YOLOEDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		YE0002 /* YOLOEDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = YOLOEDemoApp.swift; sourceTree = "<group>"; };
+		YE0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		YE0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		YE0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		YE0009 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		YE0010 = {
+			isa = PBXGroup;
+			children = (
+				YE0011 /* YOLOEDemo */,
+				YE0012 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		YE0011 /* YOLOEDemo */ = {
+			isa = PBXGroup;
+			children = (
+				YE0002 /* YOLOEDemoApp.swift */,
+				YE0004 /* ContentView.swift */,
+				YE0006 /* Assets.xcassets */,
+				YE0008 /* Info.plist */,
+			);
+			path = YOLOEDemo;
+			sourceTree = "<group>";
+		};
+		YE0012 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				YE0007 /* YOLOEDemo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		YE0013 /* YOLOEDemo */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = YE0014;
+			buildPhases = (
+				YE0015 /* Sources */,
+				YE0009 /* Frameworks */,
+				YE0016 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = YOLOEDemo;
+			productName = YOLOEDemo;
+			productReference = YE0007;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		YE0017 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				BuildIndependentTargetsInParallel = 1;
+				LastSwiftUpdateCheck = 1500;
+				LastUpgradeCheck = 1500;
+				TargetAttributes = {
+					YE0013 = {
+						CreatedOnToolsVersion = 15.0;
+					};
+				};
+			};
+			buildConfigurationList = YE0018;
+			compatibilityVersion = "Xcode 14.0";
+			developmentRegion
= en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = YE0010; + productRefGroup = YE0012; + projectDirPath = ""; + projectRoot = ""; + targets = ( + YE0013, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + YE0016 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + YE0005 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + YE0015 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + YE0001 /* YOLOEDemoApp.swift in Sources */, + YE0003 /* ContentView.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + YE0019 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + YE0020 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + YE0021 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = YOLOEDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = 
"com.coreml-models.yoloedemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + YE0022 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = YOLOEDemo/Info.plist; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.yoloedemo"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + YE0018 /* Build configuration list for PBXProject */ = { + isa = XCConfigurationList; + buildConfigurations = ( + YE0019, + YE0020, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + YE0014 /* Build configuration list for PBXNativeTarget */ = { + isa = XCConfigurationList; + buildConfigurations = ( + YE0021, + YE0022, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = YE0017; +} diff --git a/creative_apps/YOLOEDemo/YOLOEDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/YOLOEDemo/YOLOEDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/creative_apps/YOLOEDemo/YOLOEDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/YOLOEDemo/YOLOEDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/YOLOEDemo/YOLOEDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..13613e3 --- /dev/null +++ b/creative_apps/YOLOEDemo/YOLOEDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/YOLOEDemo/YOLOEDemo/Assets.xcassets/Contents.json b/creative_apps/YOLOEDemo/YOLOEDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/creative_apps/YOLOEDemo/YOLOEDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/creative_apps/YOLOEDemo/YOLOEDemo/ContentView.swift b/creative_apps/YOLOEDemo/YOLOEDemo/ContentView.swift new file mode 100644 index 0000000..912c691 --- /dev/null +++ b/creative_apps/YOLOEDemo/YOLOEDemo/ContentView.swift @@ -0,0 +1,982 @@ +import 
SwiftUI +import UIKit +import CoreML +import Vision +import PhotosUI +import AVFoundation + +// MARK: - YOLOE Open-Vocabulary Detection & Segmentation Demo +// +// YOLOE: Real-Time Seeing Anything (ICCV 2025) +// https://github.com/THU-MIG/yoloe +// +// This app demonstrates open-vocabulary object detection and instance segmentation. +// Users can type any text prompt (e.g., "coffee mug", "red car") and the model +// detects matching objects with bounding boxes and segmentation masks. +// +// Model: YOLOE-S exported to CoreML (YOLOE_S.mlmodelc) +// Input: 640x640 RGB image +// Output: bounding boxes, class confidence scores, segmentation masks +// Post-processing: Non-Maximum Suppression (NMS), confidence filtering + +// MARK: - Detection Mode + +enum DetectionMode: String, CaseIterable, Identifiable { + case detection = "Detection" + case segmentation = "Segmentation" + + var id: String { rawValue } + + var icon: String { + switch self { + case .detection: return "rectangle.dashed" + case .segmentation: return "paintbrush.pointed.fill" + } + } +} + +// MARK: - Detection Result + +struct DetectionResult: Identifiable { + let id = UUID() + let label: String + let confidence: Float + let boundingBox: CGRect + let maskData: [Float]? + let color: Color + + var confidencePercent: String { + String(format: "%.1f%%", confidence * 100) + } +} + +// MARK: - Preset Prompt Chips + +struct PromptChip: Identifiable { + let id = UUID() + let label: String + let icon: String +} + +let presetChips: [PromptChip] = [ + PromptChip(label: "person", icon: "person.fill"), + PromptChip(label: "car", icon: "car.fill"), + PromptChip(label: "dog", icon: "dog.fill"), + PromptChip(label: "phone", icon: "iphone"), + PromptChip(label: "food", icon: "fork.knife"), + PromptChip(label: "text", icon: "textformat"), +] + +// MARK: - Color Palette for Detection Classes + +let detectionColors: [Color] = [ + .red, .blue, .green, .orange, .purple, .pink, + .cyan, .yellow, .mint, .indigo, .teal, .brown +] + +func colorForIndex(_ index: Int) -> Color { + detectionColors[index % detectionColors.count] +} + +// MARK: - ContentView + +struct ContentView: View { + @StateObject private var viewModel = YOLOEViewModel() + @State private var showCamera = false + + var body: some View { + NavigationStack { + ScrollView { + VStack(spacing: 16) { + // Image input section + imageSection + + // Text prompt section + promptSection + + // Mode toggle + modeToggleSection + + // Detect button + detectButton + + // Progress indicator + if viewModel.isProcessing { + progressSection + } + + // Error display + if let error = viewModel.errorMessage { + errorSection(error) + } + + // Detection overlay on image + if !viewModel.detections.isEmpty, let image = viewModel.inputImage { + detectionOverlaySection(image: image) + } + + // Results list + if !viewModel.detections.isEmpty { + resultsListSection + } + } + .padding() + } + .navigationTitle("YOLOE Detector") + .toolbar { + ToolbarItem(placement: .navigationBarTrailing) { + Menu { + Button(action: { viewModel.showPhotoPicker = true }) { + Label("Photo Library", systemImage: "photo.on.rectangle") + } + Button(action: { showCamera = true }) { + Label("Camera", systemImage: "camera") + } + } label: { + Image(systemName: "plus.circle.fill") + .font(.title3) + } + } + } + .photosPicker(isPresented: $viewModel.showPhotoPicker, selection: $viewModel.selectedPhoto, matching: .images) + .onChange(of: viewModel.selectedPhoto) { _ in + viewModel.loadSelectedPhoto() + } + .fullScreenCover(isPresented: 
$showCamera) { + CameraPickerView(image: $viewModel.inputImage) + .ignoresSafeArea() + } + } + } + + // MARK: - Image Section + + private var imageSection: some View { + Group { + if let image = viewModel.inputImage { + ZStack(alignment: .topTrailing) { + Image(uiImage: image) + .resizable() + .aspectRatio(contentMode: .fit) + .frame(maxHeight: 300) + .cornerRadius(12) + + Button(action: { viewModel.clearImage() }) { + Image(systemName: "xmark.circle.fill") + .font(.title2) + .foregroundColor(.white) + .shadow(radius: 2) + } + .padding(8) + } + } else { + VStack(spacing: 12) { + Image(systemName: "viewfinder") + .font(.system(size: 48)) + .foregroundColor(.secondary) + Text("Select an Image") + .font(.headline) + .foregroundColor(.secondary) + Text("Use the + button to pick from library or camera") + .font(.caption) + .foregroundColor(.secondary.opacity(0.7)) + .multilineTextAlignment(.center) + } + .frame(maxWidth: .infinity) + .frame(height: 200) + .background(Color(.systemGray6)) + .cornerRadius(12) + } + } + } + + // MARK: - Prompt Section + + private var promptSection: some View { + VStack(alignment: .leading, spacing: 10) { + Text("What to detect") + .font(.headline) + + HStack { + Image(systemName: "magnifyingglass") + .foregroundColor(.secondary) + TextField("e.g. coffee mug, red car, person with hat", text: $viewModel.promptText) + .textFieldStyle(.plain) + .autocorrectionDisabled() + if !viewModel.promptText.isEmpty { + Button(action: { viewModel.promptText = "" }) { + Image(systemName: "xmark.circle.fill") + .foregroundColor(.secondary) + } + } + } + .padding(12) + .background(Color(.systemGray6)) + .cornerRadius(10) + + // Preset chips + ScrollView(.horizontal, showsIndicators: false) { + HStack(spacing: 8) { + ForEach(presetChips) { chip in + Button(action: { + appendPrompt(chip.label) + }) { + HStack(spacing: 4) { + Image(systemName: chip.icon) + .font(.caption) + Text(chip.label) + .font(.caption) + .fontWeight(.medium) + } + .padding(.horizontal, 12) + .padding(.vertical, 6) + .background( + viewModel.promptText.lowercased().contains(chip.label) + ? Color.accentColor.opacity(0.2) + : Color(.systemGray5) + ) + .foregroundColor( + viewModel.promptText.lowercased().contains(chip.label) + ? .accentColor + : .primary + ) + .cornerRadius(20) + } + } + } + } + } + } + + // MARK: - Mode Toggle + + private var modeToggleSection: some View { + VStack(alignment: .leading, spacing: 8) { + Text("Mode") + .font(.headline) + + Picker("Mode", selection: $viewModel.detectionMode) { + ForEach(DetectionMode.allCases) { mode in + Label(mode.rawValue, systemImage: mode.icon) + .tag(mode) + } + } + .pickerStyle(.segmented) + } + } + + // MARK: - Detect Button + + private var detectButton: some View { + Button(action: { viewModel.runDetection() }) { + HStack { + if viewModel.isProcessing { + ProgressView() + .tint(.white) + } else { + Image(systemName: "sparkle.magnifyingglass") + } + Text(viewModel.isProcessing ? "Detecting..." : "Detect Objects") + .fontWeight(.semibold) + } + .frame(maxWidth: .infinity) + .padding() + .background( + (viewModel.inputImage != nil && !viewModel.promptText.isEmpty && !viewModel.isProcessing) + ? 
Color.accentColor + : Color.gray + ) + .foregroundColor(.white) + .cornerRadius(12) + } + .disabled(viewModel.inputImage == nil || viewModel.promptText.isEmpty || viewModel.isProcessing) + } + + // MARK: - Progress Section + + private var progressSection: some View { + VStack(spacing: 8) { + ProgressView(value: viewModel.progress) + .progressViewStyle(.linear) + Text(viewModel.statusMessage) + .font(.caption) + .foregroundColor(.secondary) + } + } + + // MARK: - Error Section + + private func errorSection(_ error: String) -> some View { + HStack { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.red) + Text(error) + .font(.caption) + .foregroundColor(.red) + } + .padding() + .frame(maxWidth: .infinity, alignment: .leading) + .background(Color.red.opacity(0.1)) + .cornerRadius(8) + } + + // MARK: - Detection Overlay Section + + private func detectionOverlaySection(image: UIImage) -> some View { + VStack(alignment: .leading, spacing: 8) { + Text("Results") + .font(.headline) + + GeometryReader { geometry in + let aspectRatio = image.size.width / image.size.height + let displayWidth = geometry.size.width + let displayHeight = displayWidth / aspectRatio + + ZStack(alignment: .topLeading) { + Image(uiImage: image) + .resizable() + .aspectRatio(contentMode: .fit) + + // Segmentation masks + if viewModel.detectionMode == .segmentation { + ForEach(viewModel.detections) { det in + if let maskData = det.maskData { + MaskOverlayView( + maskData: maskData, + color: det.color, + displaySize: CGSize(width: displayWidth, height: displayHeight) + ) + } + } + } + + // Bounding boxes + ForEach(viewModel.detections) { det in + let rect = convertBoundingBox( + det.boundingBox, + toViewSize: CGSize(width: displayWidth, height: displayHeight) + ) + + Rectangle() + .stroke(det.color, lineWidth: 2) + .frame(width: rect.width, height: rect.height) + .overlay(alignment: .topLeading) { + Text("\(det.label) \(det.confidencePercent)") + .font(.system(size: 10, weight: .bold)) + .foregroundColor(.white) + .padding(.horizontal, 4) + .padding(.vertical, 2) + .background(det.color.opacity(0.85)) + .cornerRadius(4) + .offset(y: -18) + } + .position(x: rect.midX, y: rect.midY) + } + } + .frame(width: displayWidth, height: displayHeight) + } + .aspectRatio(image.size.width / image.size.height, contentMode: .fit) + } + } + + // MARK: - Results List + + private var resultsListSection: some View { + VStack(alignment: .leading, spacing: 8) { + HStack { + Text("Detected Objects") + .font(.headline) + Spacer() + Text("\(viewModel.detections.count) found") + .font(.caption) + .foregroundColor(.secondary) + } + + ForEach(viewModel.detections) { detection in + DetectionRowView( + detection: detection, + sourceImage: viewModel.inputImage + ) + } + } + } + + // MARK: - Helpers + + private func appendPrompt(_ text: String) { + if viewModel.promptText.isEmpty { + viewModel.promptText = text + } else if !viewModel.promptText.lowercased().contains(text.lowercased()) { + viewModel.promptText += ", \(text)" + } + } + + private func convertBoundingBox(_ bbox: CGRect, toViewSize size: CGSize) -> CGRect { + let x = bbox.origin.x * size.width + let y = bbox.origin.y * size.height + let w = bbox.size.width * size.width + let h = bbox.size.height * size.height + return CGRect(x: x, y: y, width: w, height: h) + } +} + +// MARK: - Detection Row View + +struct DetectionRowView: View { + let detection: DetectionResult + let sourceImage: UIImage? 
+
+    var body: some View {
+        HStack(spacing: 12) {
+            // Cropped thumbnail
+            if let thumb = croppedThumbnail() {
+                Image(uiImage: thumb)
+                    .resizable()
+                    .aspectRatio(contentMode: .fill)
+                    .frame(width: 50, height: 50)
+                    .cornerRadius(8)
+                    .clipped()
+            } else {
+                RoundedRectangle(cornerRadius: 8)
+                    .fill(detection.color.opacity(0.2))
+                    .frame(width: 50, height: 50)
+                    .overlay {
+                        Image(systemName: "cube.box")
+                            .foregroundColor(detection.color)
+                    }
+            }
+
+            VStack(alignment: .leading, spacing: 4) {
+                Text(detection.label)
+                    .font(.body)
+                    .fontWeight(.medium)
+
+                HStack(spacing: 8) {
+                    // Confidence bar
+                    GeometryReader { geo in
+                        ZStack(alignment: .leading) {
+                            RoundedRectangle(cornerRadius: 2)
+                                .fill(Color(.systemGray5))
+                            RoundedRectangle(cornerRadius: 2)
+                                .fill(detection.color)
+                                .frame(width: geo.size.width * CGFloat(detection.confidence))
+                        }
+                    }
+                    .frame(height: 6)
+
+                    Text(detection.confidencePercent)
+                        .font(.caption)
+                        .foregroundColor(.secondary)
+                        .frame(width: 44, alignment: .trailing)
+                }
+            }
+
+            Spacer()
+
+            Circle()
+                .fill(detection.color)
+                .frame(width: 12, height: 12)
+        }
+        .padding()
+        .background(Color(.systemGray6))
+        .cornerRadius(10)
+    }
+
+    private func croppedThumbnail() -> UIImage? {
+        guard let source = sourceImage else { return nil }
+        let bbox = detection.boundingBox
+        let cropRect = CGRect(
+            x: bbox.origin.x * source.size.width,
+            y: bbox.origin.y * source.size.height,
+            width: bbox.width * source.size.width,
+            height: bbox.height * source.size.height
+        )
+        guard cropRect.width > 0, cropRect.height > 0 else { return nil }
+        guard let cgImage = source.cgImage?.cropping(to: cropRect) else { return nil }
+        return UIImage(cgImage: cgImage)
+    }
+}
+
+// MARK: - Mask Overlay View
+
+struct MaskOverlayView: View {
+    let maskData: [Float]
+    let color: Color
+    let displaySize: CGSize
+
+    var body: some View {
+        Canvas { context, size in
+            let maskWidth = 160
+            let maskHeight = 160
+            let scaleX = size.width / CGFloat(maskWidth)
+            let scaleY = size.height / CGFloat(maskHeight)
+
+            for y in 0..<maskHeight {
+                for x in 0..<maskWidth {
+                    let idx = y * maskWidth + x
+                    if maskData[idx] > 0.5 {
+                        let rect = CGRect(
+                            x: CGFloat(x) * scaleX,
+                            y: CGFloat(y) * scaleY,
+                            width: scaleX + 0.5,
+                            height: scaleY + 0.5
+                        )
+                        context.fill(Path(rect), with: .color(color.opacity(0.35)))
+                    }
+                }
+            }
+        }
+        .frame(width: displaySize.width, height: displaySize.height)
+        .allowsHitTesting(false)
+    }
+}
+
+// MARK: - Camera Picker
+
+struct CameraPickerView: UIViewControllerRepresentable {
+    @Binding var image: UIImage?
+    @Environment(\.dismiss) private var dismiss
+
+    func makeUIViewController(context: Context) -> UIImagePickerController {
+        let picker = UIImagePickerController()
+        picker.sourceType = .camera
+        picker.delegate = context.coordinator
+        return picker
+    }
+
+    func updateUIViewController(_ uiViewController: UIImagePickerController, context: Context) {}
+
+    func makeCoordinator() -> Coordinator {
+        Coordinator(self)
+    }
+
+    class Coordinator: NSObject, UIImagePickerControllerDelegate, UINavigationControllerDelegate {
+        let parent: CameraPickerView
+
+        init(_ parent: CameraPickerView) {
+            self.parent = parent
+        }
+
+        func imagePickerController(_ picker: UIImagePickerController,
+                                   didFinishPickingMediaWithInfo info: [UIImagePickerController.InfoKey: Any]) {
+            if let uiImage = info[.originalImage] as? UIImage {
+                parent.image = uiImage
+            }
+            parent.dismiss()
+        }
+
+        func imagePickerControllerDidCancel(_ picker: UIImagePickerController) {
+            parent.dismiss()
+        }
+    }
+}
+
+// MARK: - ViewModel
+
+class YOLOEViewModel: ObservableObject {
+    @Published var inputImage: UIImage?
+ @Published var selectedPhoto: PhotosPickerItem? + @Published var showPhotoPicker = false + @Published var promptText = "" + @Published var detectionMode: DetectionMode = .detection + @Published var isProcessing = false + @Published var progress: Double = 0 + @Published var statusMessage = "" + @Published var errorMessage: String? + @Published var detections: [DetectionResult] = [] + + private var mlModel: MLModel? + + // MARK: - Load Photo from Picker + + func loadSelectedPhoto() { + guard let item = selectedPhoto else { return } + Task { + if let data = try? await item.loadTransferable(type: Data.self), + let uiImage = UIImage(data: data) { + await MainActor.run { + self.inputImage = uiImage + self.detections = [] + self.errorMessage = nil + } + } + } + } + + func clearImage() { + inputImage = nil + selectedPhoto = nil + detections = [] + errorMessage = nil + } + + // MARK: - Run Detection + + func runDetection() { + guard let image = inputImage, !promptText.isEmpty else { return } + isProcessing = true + errorMessage = nil + detections = [] + progress = 0 + + Task { + do { + let results = try await performDetection(image: image, prompt: promptText) + await MainActor.run { + self.detections = results + self.isProcessing = false + self.progress = 1.0 + self.statusMessage = "Done" + } + } catch { + await MainActor.run { + self.errorMessage = error.localizedDescription + self.isProcessing = false + } + } + } + } + + // MARK: - CoreML Inference Pipeline + + private func performDetection(image: UIImage, prompt: String) async throws -> [DetectionResult] { + await updateStatus("Loading model...", progress: 0.1) + + // Load the YOLOE-S CoreML model + guard let modelURL = Bundle.main.url(forResource: "YOLOE_S", withExtension: "mlmodelc") else { + throw YOLOEError.modelNotFound( + "YOLOE_S.mlmodelc not found in bundle. " + + "Please run convert_yoloe.py to export the model and add the compiled " + + "YOLOE_S.mlmodelc to the Xcode project." 
+            )
+        }
+
+        let config = MLModelConfiguration()
+        config.computeUnits = .cpuAndNeuralEngine
+        let model = try MLModel(contentsOf: modelURL, configuration: config)
+
+        await updateStatus("Preprocessing image...", progress: 0.3)
+
+        // Resize image to 640x640 for model input
+        guard let resizedImage = resizeImage(image, to: CGSize(width: 640, height: 640)),
+              let pixelBuffer = resizedImage.toPixelBuffer(width: 640, height: 640) else {
+            throw YOLOEError.processingFailed("Failed to preprocess input image.")
+        }
+
+        await updateStatus("Running YOLOE inference...", progress: 0.5)
+
+        // Parse prompt into individual class labels
+        let classLabels = prompt
+            .split(separator: ",")
+            .map { $0.trimmingCharacters(in: .whitespaces) }
+            .filter { !$0.isEmpty }
+
+        // Run model prediction
+        let inputFeatures = try MLDictionaryFeatureProvider(dictionary: [
+            "image": MLFeatureValue(pixelBuffer: pixelBuffer)
+        ])
+        let output = try model.prediction(from: inputFeatures)
+
+        await updateStatus("Post-processing...", progress: 0.75)
+
+        // Extract output tensors
+        // YOLOE outputs: detection boxes, scores, class predictions, and optionally masks
+        let results = try parseModelOutput(
+            output: output,
+            classLabels: classLabels,
+            imageSize: image.size,
+            confidenceThreshold: 0.25,
+            iouThreshold: 0.45,
+            includeMasks: detectionMode == .segmentation
+        )
+
+        await updateStatus("Complete!", progress: 1.0)
+        return results
+    }
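+
+    // --- Editor's sketch (not in the original patch): the transposed YOLO-style
+    // head stores attribute j of prediction i at pointer[j * numPredictions + i].
+    // This hypothetical helper only makes the stride arithmetic used in
+    // parseModelOutput below explicit; its name and shape are illustrative.
+    private func attribute(_ j: Int, ofPrediction i: Int,
+                           in pointer: UnsafeMutablePointer<Float>,
+                           numPredictions: Int) -> Float {
+        // attribute(0, ...) == cx, 1 == cy, 2 == w, 3 == h,
+        // and 4 + c == the raw score for class c.
+        return pointer[j * numPredictions + i]
+    }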
+
+    // MARK: - Parse Model Output
+
+    private func parseModelOutput(
+        output: MLFeatureProvider,
+        classLabels: [String],
+        imageSize: CGSize,
+        confidenceThreshold: Float,
+        iouThreshold: Float,
+        includeMasks: Bool
+    ) throws -> [DetectionResult] {
+        // Attempt to read the primary output feature.
+        // YOLOE typically outputs a combined tensor with shape [1, num_detections, 4+num_classes+mask_dim]
+        // or separate outputs for boxes, scores, and masks.
+        //
+        // The exact output format depends on the export configuration.
+        // Common output names: "output0" (detection), "output1" (segmentation protos)
+
+        var rawDetections: [(bbox: CGRect, confidence: Float, classIndex: Int, maskCoeffs: [Float]?)] = []
+
+        // Try to access detection output
+        let featureNames = output.featureNames
+        guard let primaryName = featureNames.first,
+              let primaryValue = output.featureValue(for: primaryName),
+              let detArray = primaryValue.multiArrayValue else {
+            throw YOLOEError.processingFailed("Could not read model output tensor.")
+        }
+
+        let shape = detArray.shape.map { $0.intValue }
+        // Expected shape: [1, numPredictions, attributes] or [1, attributes, numPredictions]
+        // attributes = 4 (bbox) + numClasses + maskDim
+
+        guard shape.count >= 2 else {
+            throw YOLOEError.processingFailed("Unexpected output shape: \(shape)")
+        }
+
+        let numClasses = classLabels.count
+        let numPredictions: Int
+        let attributeDim: Int
+
+        // YOLO-style output is typically [1, 4+numClasses+maskDim, numPredictions] (transposed)
+        if shape.count == 3 {
+            attributeDim = shape[1]
+            numPredictions = shape[2]
+        } else {
+            numPredictions = shape[0]
+            attributeDim = shape[1]
+        }
+
+        let maskDim = max(0, attributeDim - 4 - numClasses)
+        let pointer = detArray.dataPointer.assumingMemoryBound(to: Float.self)
+
+        for i in 0..<numPredictions {
+            // Box center/size for prediction i (indexing assumes the transposed layout)
+            let cx = pointer[0 * numPredictions + i]
+            let cy = pointer[1 * numPredictions + i]
+            let w = pointer[2 * numPredictions + i]
+            let h = pointer[3 * numPredictions + i]
+
+            // Pick the best-scoring class for this prediction
+            var bestScore: Float = 0
+            var bestClassIdx = 0
+            for c in 0..<numClasses {
+                let score = pointer[(4 + c) * numPredictions + i]
+                if score > bestScore {
+                    bestScore = score
+                    bestClassIdx = c
+                }
+            }
+
+            guard bestScore >= confidenceThreshold else { continue }
+
+            // Convert from center format to origin format, normalized to 0..1
+            let normX = (cx - w / 2.0) / 640.0
+            let normY = (cy - h / 2.0) / 640.0
+            let normW = w / 640.0
+            let normH = h / 640.0
+
+            let bbox = CGRect(
+                x: CGFloat(max(0, normX)),
+                y: CGFloat(max(0, normY)),
+                width: CGFloat(min(1.0 - max(0, normX), max(0, normW))),
+                height: CGFloat(min(1.0 - max(0, normY), max(0, normH)))
+            )
+
+            // Extract mask coefficients if available
+            var maskCoeffs: [Float]?
+            if includeMasks && maskDim > 0 {
+                maskCoeffs = (0..<maskDim).map { m in
+                    pointer[(4 + numClasses + m) * numPredictions + i]
+                }
+            }
+
+            rawDetections.append((bbox: bbox, confidence: bestScore, classIndex: bestClassIdx, maskCoeffs: maskCoeffs))
+        }
+
+        // Class-wise non-maximum suppression
+        let nmsResults = applyNMS(detections: rawDetections, iouThreshold: iouThreshold)
+
+        // Read segmentation prototypes from the secondary output, if present
+        var protoData: [Float]?
+        if includeMasks && featureNames.count > 1 {
+            let sortedNames = featureNames.sorted()
+            if let protoName = sortedNames.dropFirst().first,
+               let protoValue = output.featureValue(for: protoName),
+               let protoArray = protoValue.multiArrayValue {
+                let count = protoArray.count
+                protoData = Array(UnsafeBufferPointer(start: protoArray.dataPointer.assumingMemoryBound(to: Float.self), count: count))
+            }
+        }
+
+        // Convert to DetectionResult
+        let results: [DetectionResult] = nmsResults.enumerated().map { _, det in
+            let label = det.classIndex < classLabels.count ? classLabels[det.classIndex] : "object"
+            let color = colorForIndex(det.classIndex)
+
+            var maskPixels: [Float]?
+            if includeMasks, let coeffs = det.maskCoeffs, let protos = protoData {
+                maskPixels = generateMask(coefficients: coeffs, protos: protos, maskSize: 160)
+            }
+
+            return DetectionResult(
+                label: label,
+                confidence: det.confidence,
+                boundingBox: det.bbox,
+                maskData: maskPixels,
+                color: color
+            )
+        }
+
+        return results
+    }
+
+    // MARK: - Non-Maximum Suppression
+
+    private func applyNMS(
+        detections: [(bbox: CGRect, confidence: Float, classIndex: Int, maskCoeffs: [Float]?)],
+        iouThreshold: Float
+    ) -> [(bbox: CGRect, confidence: Float, classIndex: Int, maskCoeffs: [Float]?)] {
+        let sorted = detections.sorted { $0.confidence > $1.confidence }
+        var selected: [(bbox: CGRect, confidence: Float, classIndex: Int, maskCoeffs: [Float]?)] = []
+
+        for det in sorted {
+            var shouldSelect = true
+            for sel in selected {
+                if det.classIndex == sel.classIndex && computeIoU(det.bbox, sel.bbox) > iouThreshold {
+                    shouldSelect = false
+                    break
+                }
+            }
+            if shouldSelect {
+                selected.append(det)
+            }
+        }
+
+        return selected
+    }
+
+    private func computeIoU(_ a: CGRect, _ b: CGRect) -> Float {
+        let intersection = a.intersection(b)
+        guard !intersection.isNull else { return 0 }
+        let intersectionArea = intersection.width * intersection.height
+        let unionArea = a.width * a.height + b.width * b.height - intersectionArea
+        guard unionArea > 0 else { return 0 }
+        return Float(intersectionArea / unionArea)
+    }
+
+    // MARK: - Generate Segmentation Mask
+
+    private func generateMask(coefficients: [Float], protos: [Float], maskSize: Int) -> [Float] {
+        // The mask is generated by: mask = sigmoid(coefficients . protos)
+        // protos shape: [maskDim, maskSize, maskSize], coefficients shape: [maskDim]
+        let totalPixels = maskSize * maskSize
+        var mask = [Float](repeating: 0, count: totalPixels)
+
+        let maskDim = coefficients.count
+        for pixel in 0..<totalPixels {
+            var sum: Float = 0
+            for m in 0..<maskDim {
+                sum += coefficients[m] * protos[m * totalPixels + pixel]
+            }
+            // Sigmoid activation
+            mask[pixel] = 1.0 / (1.0 + exp(-sum))
+        }
+
+        return mask
+    }
+
+    // MARK: - Class Colors
+
+    // Deterministic per-class palette (cycles when classes exceed colors)
+    private func colorForIndex(_ index: Int) -> Color {
+        let palette: [Color] = [.red, .orange, .yellow, .green, .mint, .teal,
+                                .cyan, .blue, .indigo, .purple, .pink]
+        return palette[index % palette.count]
+    }
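+
+    // --- Editor's sketch (not in the original patch): the per-pixel loop in
+    // generateMask(coefficients:protos:maskSize:) is O(maskDim) per pixel and can
+    // be vectorized with Accelerate's strided dot product. Hypothetical variant,
+    // assuming the same [maskDim, maskSize * maskSize] proto layout and that the
+    // file imports Accelerate:
+    private func generateMaskUsingVDSP(coefficients: [Float], protos: [Float], maskSize: Int) -> [Float] {
+        let totalPixels = maskSize * maskSize
+        let maskDim = coefficients.count
+        var mask = [Float](repeating: 0, count: totalPixels)
+        protos.withUnsafeBufferPointer { proto in
+            for pixel in 0..<totalPixels {
+                var sum: Float = 0
+                // Values for this pixel sit totalPixels apart across the proto planes
+                vDSP_dotpr(coefficients, 1,
+                           proto.baseAddress! + pixel, vDSP_Stride(totalPixels),
+                           &sum, vDSP_Length(maskDim))
+                mask[pixel] = 1.0 / (1.0 + exp(-sum))
+            }
+        }
+        return mask
+    }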
+
+    // MARK: - Image Resizing
+
+    private func resizeImage(_ image: UIImage, to size: CGSize) -> UIImage? {
+        UIGraphicsBeginImageContextWithOptions(size, true, 1.0)
+        image.draw(in: CGRect(origin: .zero, size: size))
+        let resized = UIGraphicsGetImageFromCurrentImageContext()
+        UIGraphicsEndImageContext()
+        return resized
+    }
+
+    @MainActor
+    private func updateStatus(_ message: String, progress: Double) {
+        self.statusMessage = message
+        self.progress = progress
+    }
+}
+
+// MARK: - UIImage -> CVPixelBuffer
+
+extension UIImage {
+    func toPixelBuffer(width: Int, height: Int) -> CVPixelBuffer? {
+        let attrs: [CFString: Any] = [
+            kCVPixelBufferCGImageCompatibilityKey: true,
+            kCVPixelBufferCGBitmapContextCompatibilityKey: true
+        ]
+        var pixelBuffer: CVPixelBuffer?
+        let status = CVPixelBufferCreate(
+            kCFAllocatorDefault,
+            width, height,
+            kCVPixelFormatType_32BGRA,
+            attrs as CFDictionary,
+            &pixelBuffer
+        )
+        guard status == kCVReturnSuccess, let buffer = pixelBuffer else { return nil }
+
+        CVPixelBufferLockBaseAddress(buffer, [])
+        defer { CVPixelBufferUnlockBaseAddress(buffer, []) }
+
+        guard let context = CGContext(
+            data: CVPixelBufferGetBaseAddress(buffer),
+            width: width,
+            height: height,
+            bitsPerComponent: 8,
+            bytesPerRow: CVPixelBufferGetBytesPerRow(buffer),
+            space: CGColorSpaceCreateDeviceRGB(),
+            bitmapInfo: CGImageAlphaInfo.noneSkipFirst.rawValue | CGBitmapInfo.byteOrder32Little.rawValue
+        ) else { return nil }
+
+        guard let cgImage = self.cgImage else { return nil }
+        context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height))
+        return buffer
+    }
+}
+
+// MARK: - Errors
+
+enum YOLOEError: LocalizedError {
+    case modelNotFound(String)
+    case processingFailed(String)
+
+    var errorDescription: String? {
+        switch self {
+        case .modelNotFound(let msg): return msg
+        case .processingFailed(let msg): return msg
+        }
+    }
+}
+
+#Preview {
+    ContentView()
+}
diff --git a/creative_apps/YOLOEDemo/YOLOEDemo/Info.plist b/creative_apps/YOLOEDemo/YOLOEDemo/Info.plist
new file mode 100644
index 0000000..7532403
--- /dev/null
+++ b/creative_apps/YOLOEDemo/YOLOEDemo/Info.plist
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSCameraUsageDescription</key>
+	<string>This app needs camera access for open-vocabulary object detection.</string>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>This app needs photo library access for selecting images to detect objects.</string>
+</dict>
+</plist>
diff --git a/creative_apps/YOLOEDemo/YOLOEDemo/YOLOEDemoApp.swift b/creative_apps/YOLOEDemo/YOLOEDemo/YOLOEDemoApp.swift
new file mode 100644
index 0000000..f781c68
--- /dev/null
+++ b/creative_apps/YOLOEDemo/YOLOEDemo/YOLOEDemoApp.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+@main
+struct YOLOEDemoApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
diff --git a/sample_apps/DWPoseDemo/DWPoseDemo.xcodeproj/project.pbxproj b/sample_apps/DWPoseDemo/DWPoseDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..adab51d
--- /dev/null
+++ b/sample_apps/DWPoseDemo/DWPoseDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,340 @@
+// !$*UTF8*$!
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + DW0000010000000000000001 /* DWPoseDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = DW0000020000000000000001 /* DWPoseDemoApp.swift */; }; + DW0000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = DW0000020000000000000002 /* ContentView.swift */; }; + DW0000010000000000000004 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = DW0000020000000000000004 /* Assets.xcassets */; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + DW0000020000000000000001 /* DWPoseDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DWPoseDemoApp.swift; sourceTree = ""; }; + DW0000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + DW0000020000000000000004 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + DW0000020000000000000005 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + DW0000020000000000000010 /* DWPoseDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DWPoseDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + DW0000030000000000000001 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + DW0000040000000000000001 = { + isa = PBXGroup; + children = ( + DW0000040000000000000002 /* DWPoseDemo */, + DW0000040000000000000003 /* Products */, + ); + sourceTree = ""; + }; + DW0000040000000000000002 /* DWPoseDemo */ = { + isa = PBXGroup; + children = ( + DW0000020000000000000001 /* DWPoseDemoApp.swift */, + DW0000020000000000000002 /* ContentView.swift */, + DW0000020000000000000004 /* Assets.xcassets */, + DW0000020000000000000005 /* Info.plist */, + ); + path = DWPoseDemo; + sourceTree = ""; + }; + DW0000040000000000000003 /* Products */ = { + isa = PBXGroup; + children = ( + DW0000020000000000000010 /* DWPoseDemo.app */, + ); + name = Products; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + DW0000050000000000000001 /* DWPoseDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = DW0000070000000000000001 /* Build configuration list for PBXNativeTarget "DWPoseDemo" */; + buildPhases = ( + DW0000060000000000000001 /* Sources */, + DW0000030000000000000001 /* Frameworks */, + DW0000060000000000000002 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = DWPoseDemo; + productName = DWPoseDemo; + productReference = DW0000020000000000000010 /* DWPoseDemo.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + DW0000080000000000000001 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + DW0000050000000000000001 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = DW0000070000000000000003 /* Build configuration 
list for PBXProject "DWPoseDemo" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = DW0000040000000000000001; + productRefGroup = DW0000040000000000000003 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + DW0000050000000000000001 /* DWPoseDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + DW0000060000000000000002 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + DW0000010000000000000004 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + DW0000060000000000000001 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + DW0000010000000000000001 /* DWPoseDemoApp.swift in Sources */, + DW0000010000000000000002 /* ContentView.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + DW0000090000000000000001 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + DW0000090000000000000002 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + 
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + DW0000090000000000000003 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = DWPoseDemo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time pose estimation."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.dwpose"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + DW0000090000000000000004 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = 
""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = DWPoseDemo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time pose estimation."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.dwpose"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + DW0000070000000000000001 /* Build configuration list for PBXNativeTarget "DWPoseDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + DW0000090000000000000003 /* Debug */, + DW0000090000000000000004 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + DW0000070000000000000003 /* Build configuration list for PBXProject "DWPoseDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + DW0000090000000000000001 /* Debug */, + DW0000090000000000000002 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = DW0000080000000000000001 /* Project object */; +} diff --git a/sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/Contents.json b/sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/DWPoseDemo/DWPoseDemo/ContentView.swift b/sample_apps/DWPoseDemo/DWPoseDemo/ContentView.swift new file mode 100644 index 0000000..817df8a --- /dev/null +++ b/sample_apps/DWPoseDemo/DWPoseDemo/ContentView.swift @@ -0,0 +1,659 @@ +import SwiftUI +import UIKit +import AVFoundation +import CoreML +import Accelerate + +// MARK: - COCO Keypoint Definitions + +let keypointNames: [String] = [ + "nose", // 0 + "left_eye", // 1 + "right_eye", // 2 + "left_ear", // 3 + "right_ear", // 4 + "left_shoulder", // 5 + "right_shoulder", // 6 + "left_elbow", // 7 + "right_elbow", // 8 + "left_wrist", // 9 + "right_wrist", // 10 + "left_hip", // 11 + "right_hip", // 12 + "left_knee", // 13 + "right_knee", // 14 + "left_ankle", // 15 + "right_ankle", // 16 +] + +let skeletonConnections: [(Int, Int)] = [ + (0, 1), (0, 2), (1, 
3), (2, 4), // Head + (5, 6), // Shoulders + (5, 7), (7, 9), // Left arm + (6, 8), (8, 10), // Right arm + (5, 11), (6, 12), // Torso + (11, 12), // Hips + (11, 13), (13, 15), // Left leg + (12, 14), (14, 16), // Right leg +] + +// Left-side keypoint indices (blue) +let leftIndices: Set = [1, 3, 5, 7, 9, 11, 13, 15] +// Right-side keypoint indices (red) +let rightIndices: Set = [2, 4, 6, 8, 10, 12, 14, 16] +// Center keypoint indices (green) +let centerIndices: Set = [0] + +// MARK: - Keypoint + +struct Keypoint { + let x: CGFloat + let y: CGFloat + let confidence: Float +} + +// MARK: - Connection Color Helper + +func connectionColor(for connection: (Int, Int)) -> Color { + let (a, b) = connection + // Shoulder-to-shoulder and hip-to-hip are center connections + if (a == 5 && b == 6) || (a == 11 && b == 12) { + return .green + } + // Torso connections use the side of the limb endpoint + if leftIndices.contains(a) || leftIndices.contains(b) { + return .blue + } + if rightIndices.contains(a) || rightIndices.contains(b) { + return .red + } + return .green +} + +func keypointColor(for index: Int) -> Color { + if leftIndices.contains(index) { return .blue } + if rightIndices.contains(index) { return .red } + return .green +} + +// MARK: - Camera Manager + +class CameraManager: NSObject, ObservableObject { + let session = AVCaptureSession() + var onFrame: ((CMSampleBuffer) -> Void)? + + private let sessionQueue = DispatchQueue(label: "camera.session") + + func configure() { + sessionQueue.async { [weak self] in + self?.setupSession() + } + } + + private func setupSession() { + session.beginConfiguration() + session.sessionPreset = .high + + guard let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .back), + let input = try? AVCaptureDeviceInput(device: device) else { + session.commitConfiguration() + return + } + + if session.canAddInput(input) { + session.addInput(input) + } + + let output = AVCaptureVideoDataOutput() + output.setSampleBufferDelegate(self, queue: DispatchQueue(label: "camera.frame")) + output.alwaysDiscardsLateVideoFrames = true + + if session.canAddOutput(output) { + session.addOutput(output) + } + + session.commitConfiguration() + session.startRunning() + } + + func stop() { + sessionQueue.async { [weak self] in + self?.session.stopRunning() + } + } +} + +extension CameraManager: AVCaptureVideoDataOutputSampleBufferDelegate { + func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + onFrame?(sampleBuffer) + } +} + +// MARK: - Camera Preview + +struct CameraPreview: UIViewRepresentable { + let session: AVCaptureSession + + func makeUIView(context: Context) -> UIView { + let view = UIView(frame: .zero) + let previewLayer = AVCaptureVideoPreviewLayer(session: session) + previewLayer.videoGravity = .resizeAspectFill + view.layer.addSublayer(previewLayer) + context.coordinator.previewLayer = previewLayer + return view + } + + func updateUIView(_ uiView: UIView, context: Context) { + context.coordinator.previewLayer?.frame = uiView.bounds + } + + func makeCoordinator() -> Coordinator { + Coordinator() + } + + class Coordinator { + var previewLayer: AVCaptureVideoPreviewLayer? + } +} + +// MARK: - Pose Estimator + +class PoseEstimator: ObservableObject { + @Published var keypoints: [Keypoint] = [] + @Published var fps: Double = 0 + @Published var detectedKeypointCount: Int = 0 + @Published var errorMessage: String? + + private var mlModel: MLModel? 
+ private var isProcessing = false + private var lastTimestamp: CFTimeInterval = 0 + private var frameCount: Int = 0 + private let fpsUpdateInterval: CFTimeInterval = 0.5 + + private let confidenceThreshold: Float = 0.3 + private let smoothingFactor: CGFloat = 0.6 + private var previousKeypoints: [Keypoint] = [] + + // Model input dimensions + private let inputWidth = 192 + private let inputHeight = 256 + + // SimCC output dimensions (typically 2x input + some margin) + // For RTMPose with SimCC: x_simcc has shape (1, 17, 384), y_simcc has shape (1, 17, 512) + private let simccXSize = 384 // inputWidth * 2 + private let simccYSize = 512 // inputHeight * 2 + + init() { + loadModel() + } + + private func loadModel() { + // PLACEHOLDER: Add DWPose.mlpackage to the Xcode project. + // The compiled .mlmodelc will be bundled automatically. + // Convert using: python conversion_scripts/convert_dwpose.py + // Then drag DWPose.mlpackage into Xcode. + + guard let modelURL = Bundle.main.url(forResource: "DWPose", withExtension: "mlmodelc") else { + DispatchQueue.main.async { + self.errorMessage = "Model not found. Please add DWPose.mlpackage to the Xcode project." + } + return + } + + do { + let config = MLModelConfiguration() + config.computeUnits = .all + mlModel = try MLModel(contentsOf: modelURL, configuration: config) + } catch { + DispatchQueue.main.async { + self.errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + } + + func estimatePose(sampleBuffer: CMSampleBuffer) { + guard !isProcessing, let model = mlModel else { return } + guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } + isProcessing = true + + // Update FPS counter + let now = CACurrentMediaTime() + frameCount += 1 + if now - lastTimestamp >= fpsUpdateInterval { + let currentFPS = Double(frameCount) / (now - lastTimestamp) + frameCount = 0 + lastTimestamp = now + DispatchQueue.main.async { + self.fps = currentFPS + } + } + + // Preprocess: resize pixel buffer and create MLMultiArray input + guard let resizedBuffer = resizePixelBuffer(pixelBuffer, width: inputWidth, height: inputHeight) else { + isProcessing = false + return + } + + do { + let input = try createModelInput(from: resizedBuffer) + let output = try model.prediction(from: input) + let keypoints = postProcessSimCC(output: output) + + // Apply temporal smoothing + let smoothed = applySmoothingFilter(keypoints) + + let detected = smoothed.filter { $0.confidence >= confidenceThreshold }.count + DispatchQueue.main.async { + self.keypoints = smoothed + self.detectedKeypointCount = detected + self.previousKeypoints = smoothed + } + } catch { + // Silently skip frames with errors during inference + } + + isProcessing = false + } + + private func resizePixelBuffer(_ pixelBuffer: CVPixelBuffer, width: Int, height: Int) -> CVPixelBuffer? { + var resizedBuffer: CVPixelBuffer? 
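+        // Editor's note (added comment, not in the original patch): vImageScale
+        // below stretches the full camera frame to 192x256 without preserving
+        // aspect ratio. Top-down pose models such as DWPose are normally fed a
+        // crop around a detected person box, so expect some accuracy loss with
+        // this whole-frame shortcut.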
+        let attrs = [
+            kCVPixelBufferCGImageCompatibilityKey: true,
+            kCVPixelBufferCGBitmapContextCompatibilityKey: true
+        ] as CFDictionary
+
+        let status = CVPixelBufferCreate(
+            kCFAllocatorDefault,
+            width, height,
+            kCVPixelFormatType_32BGRA,
+            attrs,
+            &resizedBuffer
+        )
+        guard status == kCVReturnSuccess, let outputBuffer = resizedBuffer else { return nil }
+
+        CVPixelBufferLockBaseAddress(pixelBuffer, .readOnly)
+        CVPixelBufferLockBaseAddress(outputBuffer, [])
+
+        guard let srcData = CVPixelBufferGetBaseAddress(pixelBuffer),
+              let dstData = CVPixelBufferGetBaseAddress(outputBuffer) else {
+            CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly)
+            CVPixelBufferUnlockBaseAddress(outputBuffer, [])
+            return nil
+        }
+
+        var srcBuffer = vImage_Buffer(
+            data: srcData,
+            height: vImagePixelCount(CVPixelBufferGetHeight(pixelBuffer)),
+            width: vImagePixelCount(CVPixelBufferGetWidth(pixelBuffer)),
+            rowBytes: CVPixelBufferGetBytesPerRow(pixelBuffer)
+        )
+        var dstBuffer = vImage_Buffer(
+            data: dstData,
+            height: vImagePixelCount(height),
+            width: vImagePixelCount(width),
+            rowBytes: CVPixelBufferGetBytesPerRow(outputBuffer)
+        )
+
+        vImageScale_ARGB8888(&srcBuffer, &dstBuffer, nil, vImage_Flags(kvImageHighQualityResampling))
+
+        CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly)
+        CVPixelBufferUnlockBaseAddress(outputBuffer, [])
+
+        return outputBuffer
+    }
+
+    private func createModelInput(from pixelBuffer: CVPixelBuffer) throws -> MLDictionaryFeatureProvider {
+        // Create MLMultiArray with shape (1, 3, 256, 192)
+        let shape: [NSNumber] = [1, 3, NSNumber(value: inputHeight), NSNumber(value: inputWidth)]
+        let inputArray = try MLMultiArray(shape: shape, dataType: .float32)
+
+        CVPixelBufferLockBaseAddress(pixelBuffer, .readOnly)
+        defer { CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly) }
+
+        guard let baseAddress = CVPixelBufferGetBaseAddress(pixelBuffer) else {
+            throw NSError(domain: "DWPose", code: -1, userInfo: [NSLocalizedDescriptionKey: "Cannot access pixel buffer"])
+        }
+
+        let bytesPerRow = CVPixelBufferGetBytesPerRow(pixelBuffer)
+        let ptr = baseAddress.assumingMemoryBound(to: UInt8.self)
+
+        // ImageNet normalization: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+        let mean: [Float] = [0.485, 0.456, 0.406]
+        let std: [Float] = [0.229, 0.224, 0.225]
+
+        let channelStride = inputHeight * inputWidth
+        for y in 0..<inputHeight {
+            for x in 0..<inputWidth {
+                // BGRA byte order from the 32BGRA pixel buffer
+                let offset = y * bytesPerRow + x * 4
+                let b = Float(ptr[offset + 0]) / 255.0
+                let g = Float(ptr[offset + 1]) / 255.0
+                let r = Float(ptr[offset + 2]) / 255.0
+
+                let pixelIndex = y * inputWidth + x
+                inputArray[0 * channelStride + pixelIndex] = NSNumber(value: (r - mean[0]) / std[0])
+                inputArray[1 * channelStride + pixelIndex] = NSNumber(value: (g - mean[1]) / std[1])
+                inputArray[2 * channelStride + pixelIndex] = NSNumber(value: (b - mean[2]) / std[2])
+            }
+        }
+
+        // NOTE: "input" is the feature name assumed by the conversion script;
+        // adjust if the exported model uses a different one.
+        return try MLDictionaryFeatureProvider(dictionary: [
+            "input": MLFeatureValue(multiArray: inputArray)
+        ])
+    }
+
+    private func postProcessSimCC(output: MLFeatureProvider) -> [Keypoint] {
+        // RTMPose SimCC outputs: simcc_x (1, 17, simccXSize) and simcc_y (1, 17, simccYSize)
+        // Each row contains logits for discretized coordinate bins along X or Y axis
+        // The argmax of each row gives the predicted coordinate
+
+        guard let simccX = output.featureValue(for: "simcc_x")?.multiArrayValue,
+              let simccY = output.featureValue(for: "simcc_y")?.multiArrayValue else {
+            // Fallback: try alternative output names
+            return postProcessHeatmap(output: output)
+        }
+
+        let numKeypoints = 17
+        var keypoints: [Keypoint] = []
+
+        let xDim = simccX.shape.last?.intValue ?? simccXSize
+        let yDim = simccY.shape.last?.intValue ?? simccYSize
+
+        for k in 0..<numKeypoints {
+            // Find argmax and max value for x coordinate
+            var maxXVal: Float = -Float.greatestFiniteMagnitude
+            var maxXIdx: Int = 0
+            for i in 0..<xDim {
+                let val = simccX[[0, k, i] as [NSNumber]].floatValue
+                if val > maxXVal {
+                    maxXVal = val
+                    maxXIdx = i
+                }
+            }
+
+            // Find argmax and max value for y coordinate
+            var maxYVal: Float = -Float.greatestFiniteMagnitude
+            var maxYIdx: Int = 0
+            for i in 0..<yDim {
+                let val = simccY[[0, k, i] as [NSNumber]].floatValue
+                if val > maxYVal {
+                    maxYVal = val
+                    maxYIdx = i
+                }
+            }
+
+            // Convert discretized coordinates back to normalized [0, 1]
+            let normX = CGFloat(maxXIdx) / CGFloat(xDim)
+            let normY = CGFloat(maxYIdx) / CGFloat(yDim)
+
+            // Confidence is the average of softmax peaks
+            let confidence = (maxXVal + maxYVal) / 2.0
+
+            keypoints.append(Keypoint(x: normX, y: normY, confidence: confidence))
+        }
+
+        return keypoints
+    }
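+
+    // --- Editor's sketch (not in the original patch): plain argmax quantizes
+    // each coordinate to one SimCC bin. A common refinement is a local
+    // soft-argmax around the peak, which recovers sub-bin precision from the
+    // same logits. Hypothetical helper, shown for a single axis:
+    private func refinedPeak(logits: [Float], peakIndex: Int, radius: Int = 2) -> CGFloat {
+        var weightSum: Float = 0
+        var indexSum: Float = 0
+        for i in max(0, peakIndex - radius)...min(logits.count - 1, peakIndex + radius) {
+            let w = exp(logits[i] - logits[peakIndex])  // stabilized softmax weight
+            weightSum += w
+            indexSum += w * Float(i)
+        }
+        return weightSum > 0 ? CGFloat(indexSum / weightSum) : CGFloat(peakIndex)
+    }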
+
+    private func postProcessHeatmap(output: MLFeatureProvider) -> [Keypoint] {
+        // Fallback heatmap-based post-processing
+        // Some models output standard heatmaps instead of SimCC
+        guard let featureNames = output.featureNames.first,
+              let heatmaps = output.featureValue(for: featureNames)?.multiArrayValue else {
+            return Array(repeating: Keypoint(x: 0, y: 0, confidence: 0), count: 17)
+        }
+
+        let numKeypoints = 17
+        let heatmapH = heatmaps.shape[2].intValue
+        let heatmapW = heatmaps.shape[3].intValue
+        var keypoints: [Keypoint] = []
+
+        for k in 0..<numKeypoints {
+            // Scan the k-th heatmap for its peak
+            var maxVal: Float = -Float.greatestFiniteMagnitude
+            var maxRow = 0
+            var maxCol = 0
+            for row in 0..<heatmapH {
+                for col in 0..<heatmapW {
+                    let val = heatmaps[[0, k, row, col] as [NSNumber]].floatValue
+                    if val > maxVal {
+                        maxVal = val
+                        maxRow = row
+                        maxCol = col
+                    }
+                }
+            }
+
+            let normX = CGFloat(maxCol) / CGFloat(heatmapW)
+            let normY = CGFloat(maxRow) / CGFloat(heatmapH)
+
+            keypoints.append(Keypoint(x: normX, y: normY, confidence: maxVal))
+        }
+
+        return keypoints
+    }
+
+    private func applySmoothingFilter(_ current: [Keypoint]) -> [Keypoint] {
+        guard previousKeypoints.count == current.count else { return current }
+
+        return zip(current, previousKeypoints).map { (cur, prev) in
+            // Only smooth if both frames have sufficient confidence
+            if cur.confidence >= confidenceThreshold && prev.confidence >= confidenceThreshold {
+                let smoothX = cur.x * (1.0 - smoothingFactor) + prev.x * smoothingFactor
+                let smoothY = cur.y * (1.0 - smoothingFactor) + prev.y * smoothingFactor
+                return Keypoint(x: smoothX, y: smoothY, confidence: cur.confidence)
+            }
+            return cur
+        }
+    }
+}
+
+// MARK: - Skeleton Overlay
+
+struct SkeletonOverlay: View {
+    let keypoints: [Keypoint]
+    let geometrySize: CGSize
+    let confidenceThreshold: Float
+
+    var body: some View {
+        Canvas { context, size in
+            // Draw skeleton connections
+            for connection in skeletonConnections {
+                let (startIdx, endIdx) = connection
+                guard startIdx < keypoints.count, endIdx < keypoints.count else { continue }
+
+                let startKp = keypoints[startIdx]
+                let endKp = keypoints[endIdx]
+
+                guard startKp.confidence >= confidenceThreshold,
+                      endKp.confidence >= confidenceThreshold else { continue }
+
+                let startPoint = CGPoint(
+                    x: startKp.x * size.width,
+                    y: startKp.y * size.height
+                )
+                let endPoint = CGPoint(
+                    x: endKp.x * size.width,
+                    y: endKp.y * size.height
+                )
+
+                var path = Path()
+                path.move(to: startPoint)
+                path.addLine(to: endPoint)
+
+                let color = connectionColor(for: connection)
+                context.stroke(path, with: .color(color), lineWidth: 3.0)
+            }
+
+            // Draw keypoint dots
+            for (index, kp) in keypoints.enumerated() {
+                guard kp.confidence >= confidenceThreshold else { continue }
+
+                let point = CGPoint(
+                    x: kp.x * size.width,
+                    y: kp.y * size.height
+                )
+
+                let dotSize: CGFloat = 8.0
+                let rect = CGRect(
+                    x: point.x - dotSize / 2,
+                    y: point.y - dotSize / 2,
+                    width: dotSize,
+                    height: dotSize
+                )
+
+                let color = keypointColor(for: index)
+
+                // White border
+                let borderRect = CGRect(
+                    x: point.x - (dotSize + 2) / 2,
+                    y: point.y - (dotSize + 2) / 2,
+                    width: dotSize + 2,
+                    height: dotSize + 2
+                )
+                context.fill(Path(ellipseIn: borderRect), with: .color(.white))
+                context.fill(Path(ellipseIn: rect), with: .color(color))
+            }
+        }
+    }
+}
+
+// MARK: - FPS Counter View
+
+struct FPSCounterView: View
{ + let fps: Double + + var body: some View { + HStack(spacing: 4) { + Circle() + .fill(fps > 20 ? Color.green : (fps > 10 ? Color.yellow : Color.red)) + .frame(width: 8, height: 8) + Text(String(format: "%.1f FPS", fps)) + .font(.system(size: 13, weight: .bold, design: .monospaced)) + .foregroundColor(.white) + } + .padding(.horizontal, 10) + .padding(.vertical, 5) + .background(Color.black.opacity(0.6)) + .cornerRadius(8) + } +} + +// MARK: - Keypoint Count Badge + +struct KeypointCountBadge: View { + let count: Int + + var body: some View { + HStack(spacing: 4) { + Image(systemName: "figure.stand") + .font(.system(size: 11)) + .foregroundColor(.white) + Text("\(count)/17 keypoints") + .font(.system(size: 13, weight: .bold, design: .monospaced)) + .foregroundColor(.white) + } + .padding(.horizontal, 10) + .padding(.vertical, 5) + .background(Color.black.opacity(0.6)) + .cornerRadius(8) + } +} + +// MARK: - Content View + +struct ContentView: View { + @StateObject private var camera = CameraManager() + @StateObject private var estimator = PoseEstimator() + + var body: some View { + ZStack { + // Camera feed + CameraPreview(session: camera.session) + .ignoresSafeArea() + + // Skeleton overlay + GeometryReader { geometry in + SkeletonOverlay( + keypoints: estimator.keypoints, + geometrySize: geometry.size, + confidenceThreshold: 0.3 + ) + } + .ignoresSafeArea() + + VStack { + // Top bar: FPS and keypoint count + HStack { + FPSCounterView(fps: estimator.fps) + Spacer() + KeypointCountBadge(count: estimator.detectedKeypointCount) + } + .padding(.horizontal, 16) + .padding(.top, 8) + + Spacer() + + // Error message if model not loaded + if let error = estimator.errorMessage { + VStack(spacing: 8) { + Image(systemName: "exclamationmark.triangle.fill") + .font(.largeTitle) + .foregroundColor(.yellow) + Text(error) + .font(.caption) + .multilineTextAlignment(.center) + .padding(.horizontal) + } + .padding() + .background(.ultraThinMaterial, in: RoundedRectangle(cornerRadius: 16)) + .padding() + } + + // Legend at the bottom + HStack(spacing: 16) { + LegendItem(color: .blue, label: "Left") + LegendItem(color: .red, label: "Right") + LegendItem(color: .green, label: "Center") + } + .padding(.horizontal, 16) + .padding(.vertical, 8) + .background(Color.black.opacity(0.6)) + .cornerRadius(12) + .padding(.bottom, 8) + } + } + .onAppear { + camera.onFrame = { [weak estimator] buffer in + estimator?.estimatePose(sampleBuffer: buffer) + } + camera.configure() + } + .onDisappear { + camera.stop() + } + } +} + +// MARK: - Legend Item + +struct LegendItem: View { + let color: Color + let label: String + + var body: some View { + HStack(spacing: 4) { + Circle() + .fill(color) + .frame(width: 10, height: 10) + Text(label) + .font(.system(size: 12, weight: .medium)) + .foregroundColor(.white) + } + } +} + +// MARK: - Preview + +#Preview { + ContentView() +} diff --git a/sample_apps/DWPoseDemo/DWPoseDemo/DWPoseDemoApp.swift b/sample_apps/DWPoseDemo/DWPoseDemo/DWPoseDemoApp.swift new file mode 100644 index 0000000..0cf97fa --- /dev/null +++ b/sample_apps/DWPoseDemo/DWPoseDemo/DWPoseDemoApp.swift @@ -0,0 +1,10 @@ +import SwiftUI + +@main +struct DWPoseDemoApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} diff --git a/sample_apps/DWPoseDemo/DWPoseDemo/Info.plist b/sample_apps/DWPoseDemo/DWPoseDemo/Info.plist new file mode 100644 index 0000000..ae3c071 --- /dev/null +++ b/sample_apps/DWPoseDemo/DWPoseDemo/Info.plist @@ -0,0 +1,8 @@ + + + + + NSCameraUsageDescription + This app needs 
camera access for real-time pose estimation. + + diff --git a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo.xcodeproj/project.pbxproj b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo.xcodeproj/project.pbxproj new file mode 100644 index 0000000..fe04768 --- /dev/null +++ b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo.xcodeproj/project.pbxproj @@ -0,0 +1,340 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + DA0000010000000000000001 /* DepthAnythingV2DemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = DA0000020000000000000001 /* DepthAnythingV2DemoApp.swift */; }; + DA0000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = DA0000020000000000000002 /* ContentView.swift */; }; + DA0000010000000000000004 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = DA0000020000000000000004 /* Assets.xcassets */; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + DA0000020000000000000001 /* DepthAnythingV2DemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DepthAnythingV2DemoApp.swift; sourceTree = ""; }; + DA0000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + DA0000020000000000000004 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + DA0000020000000000000005 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + DA0000020000000000000010 /* DepthAnythingV2Demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DepthAnythingV2Demo.app; sourceTree = BUILT_PRODUCTS_DIR; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + DA0000030000000000000001 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + DA0000040000000000000001 = { + isa = PBXGroup; + children = ( + DA0000040000000000000002 /* DepthAnythingV2Demo */, + DA0000040000000000000003 /* Products */, + ); + sourceTree = ""; + }; + DA0000040000000000000002 /* DepthAnythingV2Demo */ = { + isa = PBXGroup; + children = ( + DA0000020000000000000001 /* DepthAnythingV2DemoApp.swift */, + DA0000020000000000000002 /* ContentView.swift */, + DA0000020000000000000004 /* Assets.xcassets */, + DA0000020000000000000005 /* Info.plist */, + ); + path = DepthAnythingV2Demo; + sourceTree = ""; + }; + DA0000040000000000000003 /* Products */ = { + isa = PBXGroup; + children = ( + DA0000020000000000000010 /* DepthAnythingV2Demo.app */, + ); + name = Products; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + DA0000050000000000000001 /* DepthAnythingV2Demo */ = { + isa = PBXNativeTarget; + buildConfigurationList = DA0000070000000000000001 /* Build configuration list for PBXNativeTarget "DepthAnythingV2Demo" */; + buildPhases = ( + DA0000060000000000000001 /* Sources */, + DA0000030000000000000001 /* Frameworks */, + DA0000060000000000000002 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = DepthAnythingV2Demo; + productName = DepthAnythingV2Demo; + productReference = 
DA0000020000000000000010 /* DepthAnythingV2Demo.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + DA0000080000000000000001 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + DA0000050000000000000001 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = DA0000070000000000000003 /* Build configuration list for PBXProject "DepthAnythingV2Demo" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = DA0000040000000000000001; + productRefGroup = DA0000040000000000000003 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + DA0000050000000000000001 /* DepthAnythingV2Demo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + DA0000060000000000000002 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + DA0000010000000000000004 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + DA0000060000000000000001 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + DA0000010000000000000001 /* DepthAnythingV2DemoApp.swift in Sources */, + DA0000010000000000000002 /* ContentView.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + DA0000090000000000000001 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR 
= YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + DA0000090000000000000002 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + DA0000090000000000000003 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = DepthAnythingV2Demo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time depth estimation."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + 
"@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.depthanythingv2"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + DA0000090000000000000004 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = DepthAnythingV2Demo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time depth estimation."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.depthanythingv2"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + DA0000070000000000000001 /* Build configuration list for PBXNativeTarget "DepthAnythingV2Demo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + DA0000090000000000000003 /* Debug */, + DA0000090000000000000004 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + DA0000070000000000000003 /* Build configuration list for PBXProject "DepthAnythingV2Demo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + DA0000090000000000000001 /* Debug */, + DA0000090000000000000002 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = DA0000080000000000000001 /* Project object */; +} diff --git a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/Contents.json b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/ContentView.swift 
b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/ContentView.swift new file mode 100644 index 0000000..44745b2 --- /dev/null +++ b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/ContentView.swift @@ -0,0 +1,438 @@ +import SwiftUI +import UIKit +import AVFoundation +import CoreML +import Vision +import Accelerate + +// MARK: - Camera Manager + +class CameraManager: NSObject, ObservableObject { + let session = AVCaptureSession() + var onFrame: ((CMSampleBuffer) -> Void)? + + private let sessionQueue = DispatchQueue(label: "camera.session") + + func configure() { + sessionQueue.async { [weak self] in + self?.setupSession() + } + } + + private func setupSession() { + session.beginConfiguration() + session.sessionPreset = .high + + guard let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .back), + let input = try? AVCaptureDeviceInput(device: device) else { + session.commitConfiguration() + return + } + + if session.canAddInput(input) { + session.addInput(input) + } + + let output = AVCaptureVideoDataOutput() + output.setSampleBufferDelegate(self, queue: DispatchQueue(label: "camera.frame")) + output.alwaysDiscardsLateVideoFrames = true + + if session.canAddOutput(output) { + session.addOutput(output) + } + + session.commitConfiguration() + session.startRunning() + } + + func stop() { + sessionQueue.async { [weak self] in + self?.session.stopRunning() + } + } +} + +extension CameraManager: AVCaptureVideoDataOutputSampleBufferDelegate { + func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + onFrame?(sampleBuffer) + } +} + +// MARK: - Camera Preview + +struct CameraPreview: UIViewRepresentable { + let session: AVCaptureSession + + func makeUIView(context: Context) -> UIView { + let view = UIView(frame: .zero) + let previewLayer = AVCaptureVideoPreviewLayer(session: session) + previewLayer.videoGravity = .resizeAspectFill + view.layer.addSublayer(previewLayer) + context.coordinator.previewLayer = previewLayer + return view + } + + func updateUIView(_ uiView: UIView, context: Context) { + context.coordinator.previewLayer?.frame = uiView.bounds + } + + func makeCoordinator() -> Coordinator { + Coordinator() + } + + class Coordinator { + var previewLayer: AVCaptureVideoPreviewLayer? + } +} + +// MARK: - Depth Estimator + +class DepthEstimator: ObservableObject { + @Published var depthImage: UIImage? + @Published var errorMessage: String? + @Published var minDepth: Float = 0 + @Published var maxDepth: Float = 0 + + private var vnModel: VNCoreMLModel? + private var isProcessing = false + + /// Width and height of the model output depth map. + private let depthSize = 518 + + init() { + loadModel() + } + + private func loadModel() { + // PLACEHOLDER: Add DepthAnythingV2Small.mlpackage to the Xcode project. + // The compiled .mlmodelc will be bundled automatically. + // Download from the CoreML-Models repository and drag into Xcode. + + guard let modelURL = Bundle.main.url(forResource: "DepthAnythingV2Small", withExtension: "mlmodelc") else { + DispatchQueue.main.async { + self.errorMessage = "Model not found. Please add DepthAnythingV2Small.mlpackage to the Xcode project." 
+ } + return + } + + do { + let config = MLModelConfiguration() + config.computeUnits = .all + let mlModel = try MLModel(contentsOf: modelURL, configuration: config) + vnModel = try VNCoreMLModel(for: mlModel) + } catch { + DispatchQueue.main.async { + self.errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + }
+ + func estimateDepth(sampleBuffer: CMSampleBuffer) { + guard !isProcessing, let vnModel = vnModel else { return } + guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } + isProcessing = true + + let request = VNCoreMLRequest(model: vnModel) { [weak self] request, error in + defer { self?.isProcessing = false } + + guard let self = self else { return } + + if let results = request.results as? [VNCoreMLFeatureValueObservation], + let multiArray = results.first?.featureValue.multiArrayValue { + self.processDepthOutput(multiArray: multiArray) + } + } + request.imageCropAndScaleOption = .scaleFill + + let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .right, options: [:]) + try? handler.perform([request]) + }
+ + private func processDepthOutput(multiArray: MLMultiArray) { + let count = multiArray.count + let size = depthSize + + // Extract raw depth values + var depths = [Float](repeating: 0, count: count) + let ptr = multiArray.dataPointer.bindMemory(to: Float.self, capacity: count) + for i in 0..<count { + depths[i] = ptr[i] + } + + // Normalize against this frame's min/max + let minVal = depths.min() ?? 0 + let maxVal = depths.max() ?? 0 + let range = maxVal - minVal + guard range > 0 else { + return + } + + // Create RGBA pixel data with a color gradient + var pixelData = [UInt8](repeating: 255, count: size * size * 4) + + for i in 0..<(size * size) { + // Output is disparity-like (larger = nearer), so invert to map near onto the warm end + let t = (maxVal - depths[i]) / range + let (r, g, b) = colormap(value: t) + pixelData[i * 4] = r + pixelData[i * 4 + 1] = g + pixelData[i * 4 + 2] = b + pixelData[i * 4 + 3] = 200 // slightly transparent so the camera feed shows through + } + + // Build a UIImage from the RGBA buffer + guard let provider = CGDataProvider(data: Data(pixelData) as CFData), + let cgImage = CGImage(width: size, height: size, bitsPerComponent: 8, bitsPerPixel: 32, bytesPerRow: size * 4, space: CGColorSpaceCreateDeviceRGB(), bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.last.rawValue), provider: provider, decode: nil, shouldInterpolate: false, intent: .defaultIntent) else { + return + } + let image = UIImage(cgImage: cgImage) + + DispatchQueue.main.async { + self.depthImage = image + self.minDepth = minVal + self.maxDepth = maxVal + } + }
+ + private func colormap(value: Float) -> (UInt8, UInt8, UInt8) { + // Turbo-inspired colormap: near = warm (red), far = cool (blue) + let t = max(0, min(1, value)) + + let r: Float + let g: Float + let b: Float + + if t < 0.25 { + // Red -> Yellow + let s = t / 0.25 + r = 1.0 + g = s + b = 0.0 + } else if t < 0.5 { + // Yellow -> Green + let s = (t - 0.25) / 0.25 + r = 1.0 - s + g = 1.0 + b = 0.0 + } else if t < 0.75 { + // Green -> Cyan + let s = (t - 0.5) / 0.25 + r = 0.0 + g = 1.0 + b = s + } else { + // Cyan -> Blue + let s = (t - 0.75) / 0.25 + r = 0.0 + g = 1.0 - s + b = 1.0 + } + + return ( + UInt8(r * 255), + UInt8(g * 255), + UInt8(b * 255) + ) + }
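An aside on the loops above: ContentView.swift already imports Accelerate, so the scalar min/max and normalization passes in processDepthOutput could be vectorized with vDSP. A minimal sketch reusing the `depths` and `count` locals from that function (illustrative only, not part of this patch):

    var minVal: Float = 0
    var maxVal: Float = 0
    vDSP_minv(depths, 1, &minVal, vDSP_Length(count))   // vector minimum
    vDSP_maxv(depths, 1, &maxVal, vDSP_Length(count))   // vector maximum
    var negMin = -minVal
    var invRange = 1 / max(maxVal - minVal, .leastNormalMagnitude)
    var normalized = [Float](repeating: 0, count: count)
    vDSP_vsadd(depths, 1, &negMin, &normalized, 1, vDSP_Length(count))        // depths - min
    vDSP_vsmul(normalized, 1, &invRange, &normalized, 1, vDSP_Length(count))  // * 1 / range

On the 518x518 output this collapses several hundred thousand scalar iterations per frame into a handful of SIMD calls.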
+ + /// Creates a fully opaque version of the depth map for full-screen display. + func opaqueDepthImage() -> UIImage? { + guard let cgImage = depthImage?.cgImage else { return nil } + let width = cgImage.width + let height = cgImage.height + + guard let context = CGContext( + data: nil, + width: width, + height: height, + bitsPerComponent: 8, + bytesPerRow: width * 4, + space: CGColorSpaceCreateDeviceRGB(), + bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue + ) else { return nil } + + // Draw black background then the image on top + context.setFillColor(UIColor.black.cgColor) + context.fill(CGRect(x: 0, y: 0, width: width, height: height)) + context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height)) + + guard let result = context.makeImage() else { return nil } + return UIImage(cgImage: result) + } +}
+ +// MARK: - Content View + +struct ContentView: View { + @StateObject private var camera = CameraManager() + @StateObject private var depthEstimator = DepthEstimator() + @State private var showFullDepthMap = false + + var body: some View { + ZStack { + // Camera feed (hidden when full depth map is shown) + if !showFullDepthMap { + CameraPreview(session: camera.session) + .ignoresSafeArea() + } else { + Color.black + .ignoresSafeArea() + } + + // Depth map overlay or full-screen depth map + if let depthImg = depthEstimator.depthImage { + if showFullDepthMap { + if let opaqueImg = depthEstimator.opaqueDepthImage() { + Image(uiImage: opaqueImg) + .resizable() + .aspectRatio(contentMode: .fill) + .ignoresSafeArea() + } + } else { + Image(uiImage: depthImg) + .resizable() + .aspectRatio(contentMode: .fill) + .ignoresSafeArea() + } + } + + // UI controls + VStack { + // Top bar with title and toggle + HStack { + Text("Depth Anything V2") + .font(.headline) + .foregroundColor(.white) + + Spacer() + + Button(action: { + withAnimation(.easeInOut(duration: 0.3)) { + showFullDepthMap.toggle() + } + }) { + HStack(spacing: 4) { + Image(systemName: showFullDepthMap ? "camera.fill" : "square.stack.3d.up.fill") + .font(.body) + Text(showFullDepthMap ?
"Camera" : "Depth") + .font(.caption) + } + .padding(.horizontal, 12) + .padding(.vertical, 6) + .background(.ultraThinMaterial, in: Capsule()) + .foregroundColor(.white) + } + } + .padding(.horizontal) + .padding(.top, 8) + + Spacer() + + // Error message if model not loaded + if let error = depthEstimator.errorMessage { + VStack(spacing: 8) { + Image(systemName: "exclamationmark.triangle.fill") + .font(.largeTitle) + .foregroundColor(.yellow) + Text(error) + .font(.caption) + .multilineTextAlignment(.center) + .padding(.horizontal) + } + .padding() + .background(.ultraThinMaterial, in: RoundedRectangle(cornerRadius: 16)) + .padding() + } + + // Depth info overlay + if depthEstimator.depthImage != nil { + VStack(spacing: 8) { + // Color legend + HStack(spacing: 0) { + Text("Near") + .font(.caption2) + .foregroundColor(.white) + Spacer() + + // Gradient bar + LinearGradient( + gradient: Gradient(colors: [.red, .yellow, .green, .cyan, .blue]), + startPoint: .leading, + endPoint: .trailing + ) + .frame(height: 8) + .cornerRadius(4) + .padding(.horizontal, 8) + + Spacer() + Text("Far") + .font(.caption2) + .foregroundColor(.white) + } + + // Depth statistics + HStack { + Label { + Text(String(format: "Min: %.2f", depthEstimator.minDepth)) + .font(.system(.caption2, design: .monospaced)) + } icon: { + Image(systemName: "arrow.down.circle.fill") + .foregroundColor(.red) + .font(.caption2) + } + + Spacer() + + Label { + Text(String(format: "Max: %.2f", depthEstimator.maxDepth)) + .font(.system(.caption2, design: .monospaced)) + } icon: { + Image(systemName: "arrow.up.circle.fill") + .foregroundColor(.blue) + .font(.caption2) + } + } + .foregroundColor(.white) + } + .padding() + .background(.black.opacity(0.7), in: RoundedRectangle(cornerRadius: 16)) + .padding() + } + } + } + .onAppear { + camera.onFrame = { [weak depthEstimator] buffer in + depthEstimator?.estimateDepth(sampleBuffer: buffer) + } + camera.configure() + } + .onDisappear { + camera.stop() + } + } +} + +#Preview { + ContentView() +} diff --git a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/DepthAnythingV2DemoApp.swift b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/DepthAnythingV2DemoApp.swift new file mode 100644 index 0000000..c245af6 --- /dev/null +++ b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/DepthAnythingV2DemoApp.swift @@ -0,0 +1,10 @@ +import SwiftUI + +@main +struct DepthAnythingV2DemoApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} diff --git a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Info.plist b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Info.plist new file mode 100644 index 0000000..7fac6e7 --- /dev/null +++ b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Info.plist @@ -0,0 +1,8 @@ + + + + + NSCameraUsageDescription + This app needs camera access for real-time depth estimation. + + diff --git a/sample_apps/YOLOv10Demo/YOLOv10Demo.xcodeproj/project.pbxproj b/sample_apps/YOLOv10Demo/YOLOv10Demo.xcodeproj/project.pbxproj new file mode 100644 index 0000000..3f81c04 --- /dev/null +++ b/sample_apps/YOLOv10Demo/YOLOv10Demo.xcodeproj/project.pbxproj @@ -0,0 +1,340 @@ +// !$*UTF8*$! 
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + YV0000010000000000000001 /* YOLOv10DemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = YV0000020000000000000001 /* YOLOv10DemoApp.swift */; }; + YV0000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = YV0000020000000000000002 /* ContentView.swift */; }; + YV0000010000000000000004 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = YV0000020000000000000004 /* Assets.xcassets */; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + YV0000020000000000000001 /* YOLOv10DemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = YOLOv10DemoApp.swift; sourceTree = "<group>"; }; + YV0000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; + YV0000020000000000000004 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; + YV0000020000000000000005 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; + YV0000020000000000000010 /* YOLOv10Demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = YOLOv10Demo.app; sourceTree = BUILT_PRODUCTS_DIR; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + YV0000030000000000000001 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + YV0000040000000000000001 = { + isa = PBXGroup; + children = ( + YV0000040000000000000002 /* YOLOv10Demo */, + YV0000040000000000000003 /* Products */, + ); + sourceTree = "<group>"; + }; + YV0000040000000000000002 /* YOLOv10Demo */ = { + isa = PBXGroup; + children = ( + YV0000020000000000000001 /* YOLOv10DemoApp.swift */, + YV0000020000000000000002 /* ContentView.swift */, + YV0000020000000000000004 /* Assets.xcassets */, + YV0000020000000000000005 /* Info.plist */, + ); + path = YOLOv10Demo; + sourceTree = "<group>"; + }; + YV0000040000000000000003 /* Products */ = { + isa = PBXGroup; + children = ( + YV0000020000000000000010 /* YOLOv10Demo.app */, + ); + name = Products; + sourceTree = "<group>"; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + YV0000050000000000000001 /* YOLOv10Demo */ = { + isa = PBXNativeTarget; + buildConfigurationList = YV0000070000000000000001 /* Build configuration list for PBXNativeTarget "YOLOv10Demo" */; + buildPhases = ( + YV0000060000000000000001 /* Sources */, + YV0000030000000000000001 /* Frameworks */, + YV0000060000000000000002 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = YOLOv10Demo; + productName = YOLOv10Demo; + productReference = YV0000020000000000000010 /* YOLOv10Demo.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + YV0000080000000000000001 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + YV0000050000000000000001 = { + CreatedOnToolsVersion = 15.0; + }; + }; + }; + buildConfigurationList = YV0000070000000000000003 /* Build
configuration list for PBXProject "YOLOv10Demo" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = YV0000040000000000000001; + productRefGroup = YV0000040000000000000003 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + YV0000050000000000000001 /* YOLOv10Demo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + YV0000060000000000000002 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + YV0000010000000000000004 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + YV0000060000000000000001 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + YV0000010000000000000001 /* YOLOv10DemoApp.swift in Sources */, + YV0000010000000000000002 /* ContentView.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + YV0000090000000000000001 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + YV0000090000000000000002 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + 
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + YV0000090000000000000003 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = YOLOv10Demo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time object detection."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.yolov10"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + YV0000090000000000000004 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM 
= ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = YOLOv10Demo/Info.plist; + INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time object detection."; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.yolov10"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + YV0000070000000000000001 /* Build configuration list for PBXNativeTarget "YOLOv10Demo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + YV0000090000000000000003 /* Debug */, + YV0000090000000000000004 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + YV0000070000000000000003 /* Build configuration list for PBXProject "YOLOv10Demo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + YV0000090000000000000001 /* Debug */, + YV0000090000000000000002 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + + }; + rootObject = YV0000080000000000000001 /* Project object */; +} diff --git a/sample_apps/YOLOv10Demo/YOLOv10Demo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/YOLOv10Demo/YOLOv10Demo/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000..eb87897 --- /dev/null +++ b/sample_apps/YOLOv10Demo/YOLOv10Demo/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/YOLOv10Demo/YOLOv10Demo/Assets.xcassets/Contents.json b/sample_apps/YOLOv10Demo/YOLOv10Demo/Assets.xcassets/Contents.json new file mode 100644 index 0000000..73c0059 --- /dev/null +++ b/sample_apps/YOLOv10Demo/YOLOv10Demo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/sample_apps/YOLOv10Demo/YOLOv10Demo/ContentView.swift b/sample_apps/YOLOv10Demo/YOLOv10Demo/ContentView.swift new file mode 100644 index 0000000..f16a8c4 --- /dev/null +++ b/sample_apps/YOLOv10Demo/YOLOv10Demo/ContentView.swift @@ -0,0 +1,452 @@ +import SwiftUI +import UIKit +import AVFoundation +import CoreML +import Vision + +// MARK: - COCO Class Labels + +let cocoClassLabels: [Int: String] = [ + 0: "person", 1: "bicycle", 2: "car", 3: "motorcycle", 4: "airplane", + 5: "bus", 6: "train", 7: "truck", 8: "boat", 9: "traffic light", + 10: "fire hydrant", 11: "stop sign", 12: "parking meter", 13: "bench", + 14: "bird", 15: "cat", 16: "dog", 17: "horse", 18: "sheep", 19: "cow", + 20: "elephant", 21: "bear", 22: "zebra", 23: "giraffe", 24: "backpack", + 25: "umbrella", 26: "handbag", 27: "tie", 28: 
"suitcase", 29: "frisbee", + 30: "skis", 31: "snowboard", 32: "sports ball", 33: "kite", + 34: "baseball bat", 35: "baseball glove", 36: "skateboard", 37: "surfboard", + 38: "tennis racket", 39: "bottle", 40: "wine glass", 41: "cup", 42: "fork", + 43: "knife", 44: "spoon", 45: "bowl", 46: "banana", 47: "apple", + 48: "sandwich", 49: "orange", 50: "broccoli", 51: "carrot", 52: "hot dog", + 53: "pizza", 54: "donut", 55: "cake", 56: "chair", 57: "couch", + 58: "potted plant", 59: "bed", 60: "dining table", 61: "toilet", 62: "tv", + 63: "laptop", 64: "mouse", 65: "remote", 66: "keyboard", 67: "cell phone", + 68: "microwave", 69: "oven", 70: "toaster", 71: "sink", 72: "refrigerator", + 73: "book", 74: "clock", 75: "vase", 76: "scissors", 77: "teddy bear", + 78: "hair drier", 79: "toothbrush" +] + +// MARK: - Color Palette for Classes + +struct ClassColors { + static let palette: [Color] = [ + .red, .green, .blue, .orange, .purple, + .pink, .yellow, .cyan, .mint, .indigo, + .teal, .brown, Color(red: 1, green: 0.4, blue: 0.4), + Color(red: 0.4, green: 1, blue: 0.4), + Color(red: 0.4, green: 0.4, blue: 1), + Color(red: 1, green: 0.8, blue: 0), Color(red: 0, green: 0.8, blue: 0.8), + Color(red: 0.8, green: 0, blue: 0.8), Color(red: 0.6, green: 0.4, blue: 0.2), + Color(red: 0.2, green: 0.6, blue: 0.4) + ] + + static func color(for classIndex: Int) -> Color { + palette[classIndex % palette.count] + } +} + +// MARK: - Detection Result + +struct Detection: Identifiable { + let id = UUID() + let classIndex: Int + let label: String + let confidence: Float + let boundingBox: CGRect // Normalized coordinates (0..1), Vision convention (origin bottom-left) +} + +// MARK: - Camera Manager + +class CameraManager: NSObject, ObservableObject { + let session = AVCaptureSession() + var onFrame: ((CMSampleBuffer) -> Void)? + + private let sessionQueue = DispatchQueue(label: "camera.session") + + func configure() { + sessionQueue.async { [weak self] in + self?.setupSession() + } + } + + private func setupSession() { + session.beginConfiguration() + session.sessionPreset = .high + + guard let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .back), + let input = try? 
AVCaptureDeviceInput(device: device) else { + session.commitConfiguration() + return + } + + if session.canAddInput(input) { + session.addInput(input) + } + + let output = AVCaptureVideoDataOutput() + output.setSampleBufferDelegate(self, queue: DispatchQueue(label: "camera.frame")) + output.alwaysDiscardsLateVideoFrames = true + + if session.canAddOutput(output) { + session.addOutput(output) + } + + session.commitConfiguration() + session.startRunning() + } + + func stop() { + sessionQueue.async { [weak self] in + self?.session.stopRunning() + } + } +} + +extension CameraManager: AVCaptureVideoDataOutputSampleBufferDelegate { + func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + onFrame?(sampleBuffer) + } +} + +// MARK: - Camera Preview + +struct CameraPreview: UIViewRepresentable { + let session: AVCaptureSession + + func makeUIView(context: Context) -> UIView { + let view = UIView(frame: .zero) + let previewLayer = AVCaptureVideoPreviewLayer(session: session) + previewLayer.videoGravity = .resizeAspectFill + view.layer.addSublayer(previewLayer) + context.coordinator.previewLayer = previewLayer + return view + } + + func updateUIView(_ uiView: UIView, context: Context) { + context.coordinator.previewLayer?.frame = uiView.bounds + } + + func makeCoordinator() -> Coordinator { + Coordinator() + } + + class Coordinator { + var previewLayer: AVCaptureVideoPreviewLayer? + } +} + +// MARK: - Object Detector + +class ObjectDetector: ObservableObject { + @Published var detections: [Detection] = [] + @Published var fps: Double = 0 + @Published var errorMessage: String? + + private var vnModel: VNCoreMLModel? + private var isProcessing = false + private var lastTimestamp: CFTimeInterval = 0 + private var frameCount: Int = 0 + private let fpsUpdateInterval: CFTimeInterval = 0.5 + + private let confidenceThreshold: Float = 0.25 + + init() { + loadModel() + } + + private func loadModel() { + // PLACEHOLDER: Add YOLOv10N.mlpackage to the Xcode project. + // The compiled .mlmodelc will be bundled automatically. + // Convert using: python conversion_scripts/convert_yolov10.py + // Then drag yolov10n.mlpackage into Xcode and rename to YOLOv10N. + + guard let modelURL = Bundle.main.url(forResource: "YOLOv10N", withExtension: "mlmodelc") else { + DispatchQueue.main.async { + self.errorMessage = "Model not found. Please add YOLOv10N.mlpackage to the Xcode project." + } + return + } + + do { + let config = MLModelConfiguration() + config.computeUnits = .all + let mlModel = try MLModel(contentsOf: modelURL, configuration: config) + vnModel = try VNCoreMLModel(for: mlModel) + } catch { + DispatchQueue.main.async { + self.errorMessage = "Failed to load model: \(error.localizedDescription)" + } + } + } + + func detect(sampleBuffer: CMSampleBuffer) { + guard !isProcessing, let vnModel = vnModel else { return } + guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } + isProcessing = true + + // Update FPS counter + let now = CACurrentMediaTime() + frameCount += 1 + if now - lastTimestamp >= fpsUpdateInterval { + let currentFPS = Double(frameCount) / (now - lastTimestamp) + frameCount = 0 + lastTimestamp = now + DispatchQueue.main.async { + self.fps = currentFPS + } + } + + let request = VNCoreMLRequest(model: vnModel) { [weak self] request, error in + defer { self?.isProcessing = false } + guard let self = self else { return } + + if let results = request.results as? 
[VNRecognizedObjectObservation] { + self.processRecognizedObjects(results) + } else { + DispatchQueue.main.async { + self.detections = [] + } + } + } + request.imageCropAndScaleOption = .scaleFill + + let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .right, options: [:]) + try? handler.perform([request]) + } + + private func processRecognizedObjects(_ observations: [VNRecognizedObjectObservation]) { + let filtered = observations.filter { $0.confidence >= confidenceThreshold } + + let results: [Detection] = filtered.compactMap { observation in + guard let topLabel = observation.labels.first else { return nil } + + // Attempt to find COCO class index from label identifier + let classIndex = cocoClassLabels.first(where: { $0.value == topLabel.identifier })?.key ?? 0 + + return Detection( + classIndex: classIndex, + label: topLabel.identifier, + confidence: topLabel.confidence, + boundingBox: observation.boundingBox + ) + } + + DispatchQueue.main.async { + self.detections = results + } + } +} + +// MARK: - Bounding Box Overlay + +struct BoundingBoxOverlay: View { + let detections: [Detection] + let geometrySize: CGSize + + var body: some View { + ForEach(detections) { detection in + let rect = convertBoundingBox(detection.boundingBox, in: geometrySize) + let boxColor = ClassColors.color(for: detection.classIndex) + + ZStack(alignment: .topLeading) { + // Bounding box rectangle + Rectangle() + .stroke(boxColor, lineWidth: 2.5) + .frame(width: rect.width, height: rect.height) + .position(x: rect.midX, y: rect.midY) + + // Label background and text + Text("\(detection.label) \(String(format: "%.0f%%", detection.confidence * 100))") + .font(.system(size: 11, weight: .semibold, design: .monospaced)) + .foregroundColor(.white) + .padding(.horizontal, 4) + .padding(.vertical, 2) + .background(boxColor.opacity(0.85)) + .cornerRadius(4) + .position(x: rect.minX + 40, y: rect.minY - 10) + } + } + } + + /// Convert Vision normalized coordinates (origin bottom-left) to UIKit coordinates (origin top-left). + private func convertBoundingBox(_ box: CGRect, in size: CGSize) -> CGRect { + let x = box.origin.x * size.width + let y = (1 - box.origin.y - box.height) * size.height + let width = box.width * size.width + let height = box.height * size.height + return CGRect(x: x, y: y, width: width, height: height) + } +} + +// MARK: - FPS Counter View + +struct FPSCounterView: View { + let fps: Double + + var body: some View { + HStack(spacing: 4) { + Circle() + .fill(fps > 20 ? Color.green : (fps > 10 ? Color.yellow : Color.red)) + .frame(width: 8, height: 8) + Text(String(format: "%.1f FPS", fps)) + .font(.system(size: 13, weight: .bold, design: .monospaced)) + .foregroundColor(.white) + } + .padding(.horizontal, 10) + .padding(.vertical, 5) + .background(Color.black.opacity(0.6)) + .cornerRadius(8) + } +} + +// MARK: - Detection Count Badge + +struct DetectionCountBadge: View { + let count: Int + + var body: some View { + HStack(spacing: 4) { + Image(systemName: "eye.fill") + .font(.system(size: 11)) + .foregroundColor(.white) + Text("\(count) object\(count == 1 ? 
"" : "s")") + .font(.system(size: 13, weight: .bold, design: .monospaced)) + .foregroundColor(.white) + } + .padding(.horizontal, 10) + .padding(.vertical, 5) + .background(Color.black.opacity(0.6)) + .cornerRadius(8) + } +} + +// MARK: - Content View + +struct ContentView: View { + @StateObject private var camera = CameraManager() + @StateObject private var detector = ObjectDetector() + + var body: some View { + ZStack { + // Camera feed + CameraPreview(session: camera.session) + .ignoresSafeArea() + + // Bounding box overlay + GeometryReader { geometry in + BoundingBoxOverlay( + detections: detector.detections, + geometrySize: geometry.size + ) + } + .ignoresSafeArea() + + VStack { + // Top bar: FPS and detection count + HStack { + FPSCounterView(fps: detector.fps) + Spacer() + DetectionCountBadge(count: detector.detections.count) + } + .padding(.horizontal, 16) + .padding(.top, 8) + + Spacer() + + // Error message if model not loaded + if let error = detector.errorMessage { + VStack(spacing: 8) { + Image(systemName: "exclamationmark.triangle.fill") + .font(.largeTitle) + .foregroundColor(.yellow) + Text(error) + .font(.caption) + .multilineTextAlignment(.center) + .padding(.horizontal) + } + .padding() + .background(.ultraThinMaterial, in: RoundedRectangle(cornerRadius: 16)) + .padding() + } + + // Detection list overlay at the bottom + if !detector.detections.isEmpty { + VStack(alignment: .leading, spacing: 6) { + Text("YOLOv10-N Detections") + .font(.headline) + .foregroundColor(.white) + + let grouped = groupedDetections(detector.detections) + ForEach(Array(grouped.prefix(5).enumerated()), id: \.offset) { _, item in + HStack { + Circle() + .fill(ClassColors.color(for: item.classIndex)) + .frame(width: 10, height: 10) + Text(item.label) + .font(.system(.body, design: .monospaced)) + .foregroundColor(.white) + if item.count > 1 { + Text("x\(item.count)") + .font(.system(.caption, design: .monospaced)) + .foregroundColor(.white.opacity(0.7)) + } + Spacer() + Text(String(format: "%.0f%%", item.maxConfidence * 100)) + .font(.system(.body, design: .monospaced)) + .foregroundColor(.green) + } + } + } + .padding() + .background(.black.opacity(0.7), in: RoundedRectangle(cornerRadius: 16)) + .padding(.horizontal) + .padding(.bottom, 8) + } + } + } + .onAppear { + camera.onFrame = { [weak detector] buffer in + detector?.detect(sampleBuffer: buffer) + } + camera.configure() + } + .onDisappear { + camera.stop() + } + } + + // Group detections by class for the summary panel + private func groupedDetections(_ detections: [Detection]) -> [GroupedDetection] { + var dict: [String: GroupedDetection] = [:] + for d in detections { + if var existing = dict[d.label] { + existing.count += 1 + existing.maxConfidence = max(existing.maxConfidence, d.confidence) + dict[d.label] = existing + } else { + dict[d.label] = GroupedDetection( + classIndex: d.classIndex, + label: d.label, + count: 1, + maxConfidence: d.confidence + ) + } + } + return dict.values.sorted { $0.maxConfidence > $1.maxConfidence } + } +} + +// MARK: - Grouped Detection + +struct GroupedDetection { + let classIndex: Int + let label: String + var count: Int + var maxConfidence: Float +} + +// MARK: - Preview + +#Preview { + ContentView() +} diff --git a/sample_apps/YOLOv10Demo/YOLOv10Demo/Info.plist b/sample_apps/YOLOv10Demo/YOLOv10Demo/Info.plist new file mode 100644 index 0000000..5e0e57c --- /dev/null +++ b/sample_apps/YOLOv10Demo/YOLOv10Demo/Info.plist @@ -0,0 +1,8 @@ + + + + + NSCameraUsageDescription + This app needs camera access for 
diff --git a/sample_apps/YOLOv10Demo/YOLOv10Demo/Info.plist b/sample_apps/YOLOv10Demo/YOLOv10Demo/Info.plist new file mode 100644 index 0000000..5e0e57c --- /dev/null +++ b/sample_apps/YOLOv10Demo/YOLOv10Demo/Info.plist @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> +<plist version="1.0"> +<dict> + <key>NSCameraUsageDescription</key> + <string>This app needs camera access for real-time object detection.</string> +</dict> +</plist>
diff --git a/sample_apps/YOLOv10Demo/YOLOv10Demo/YOLOv10DemoApp.swift b/sample_apps/YOLOv10Demo/YOLOv10Demo/YOLOv10DemoApp.swift new file mode 100644 index 0000000..7af8f2d --- /dev/null +++ b/sample_apps/YOLOv10Demo/YOLOv10Demo/YOLOv10DemoApp.swift @@ -0,0 +1,10 @@ +import SwiftUI + +@main +struct YOLOv10DemoApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +}
From 040d4731fc886bbb8fddaa4b508fe1e3fd2bfc76 Mon Sep 17 00:00:00 2001 From: MLBoy_DaisukeMajima Date: Tue, 31 Mar 2026 02:21:22 +0900 Subject: [PATCH 06/18] Add mlpackage references to Xcode projects for local testing
Models are distributed via GitHub Releases (not in repo). Download the .mlpackage files and place them in the app directory to build.
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../BiRefNetDemo.xcodeproj/project.pbxproj | 4 ++ .../DepthProDemo.xcodeproj/project.pbxproj | 39 +++++++++++-------- .../KokoroDemo.xcodeproj/project.pbxproj | 4 ++ .../SmolVLMDemo.xcodeproj/project.pbxproj | 4 ++ .../WhisperDemo.xcodeproj/project.pbxproj | 4 ++ .../YOLOEDemo.xcodeproj/project.pbxproj | 4 ++ .../project.pbxproj | 4 ++ .../YOLOv10Demo.xcodeproj/project.pbxproj | 4 ++ 8 files changed, 50 insertions(+), 17 deletions(-)
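Since the .mlpackage files live in GitHub Releases rather than the repo, an app could also fetch a package at first launch and compile it on device instead of bundling it; MLModel.compileModel(at:) produces the same .mlmodelc the demos load from the bundle. A rough sketch (how packageURL is obtained, e.g. download plus unzip, is left out; this is not part of the patch):

    import CoreML
    import Foundation

    // packageURL points at an .mlpackage already downloaded to disk.
    func loadDownloadedModel(packageURL: URL) async throws -> MLModel {
        let compiledURL = try await MLModel.compileModel(at: packageURL) // emits .mlmodelc
        let config = MLModelConfiguration()
        config.computeUnits = .all
        return try MLModel(contentsOf: compiledURL, configuration: config)
    }

Caching compiledURL somewhere permanent is worthwhile, since the compiler writes its output to a temporary directory.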
diff --git a/creative_apps/BiRefNetDemo/BiRefNetDemo.xcodeproj/project.pbxproj b/creative_apps/BiRefNetDemo/BiRefNetDemo.xcodeproj/project.pbxproj index 2f2013f..95a56c9 100644 --- a/creative_apps/BiRefNetDemo/BiRefNetDemo.xcodeproj/project.pbxproj +++ b/creative_apps/BiRefNetDemo/BiRefNetDemo.xcodeproj/project.pbxproj @@ -10,6 +10,7 @@ BR0001 /* BiRefNetDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = BR0002; }; BR0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = BR0004; }; BR0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = BR0006; }; + BRML02 /* BiRefNet.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = BRML01 /* BiRefNet.mlpackage */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -18,6 +19,7 @@ BR0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; BR0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; BR0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; + BRML01 /* BiRefNet.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = BiRefNet.mlpackage; sourceTree = "<group>"; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -46,6 +48,7 @@ BR0004 /* ContentView.swift */, BR0006 /* Assets.xcassets */, BR0008 /* Info.plist */, + BRML01 /* BiRefNet.mlpackage */, ); path = BiRefNetDemo; sourceTree = "<group>"; @@ -129,6 +132,7 @@ files = ( BR0001 /* BiRefNetDemoApp.swift in Sources */, BR0003 /* ContentView.swift in Sources */, + BRML02 /* BiRefNet.mlpackage in Sources */, ); runOnlyForDeploymentPostprocessing = 0; };
diff --git a/creative_apps/DepthProDemo/DepthProDemo.xcodeproj/project.pbxproj b/creative_apps/DepthProDemo/DepthProDemo.xcodeproj/project.pbxproj index 234a9c3..f9d998a 100644 --- a/creative_apps/DepthProDemo/DepthProDemo.xcodeproj/project.pbxproj +++ b/creative_apps/DepthProDemo/DepthProDemo.xcodeproj/project.pbxproj @@ -7,17 +7,19 @@ objects = { /* Begin PBXBuildFile section */ - DP0001 /* DepthProDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = DP0002; }; - DP0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = DP0004; }; - DP0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = DP0006; }; + DP0001 /* DepthProDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = DP0002 /* DepthProDemoApp.swift */; }; + DP0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = DP0004 /* ContentView.swift */; }; + DP0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = DP0006 /* Assets.xcassets */; }; + DPML02 /* DepthPro.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = DPML01 /* DepthPro.mlpackage */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ - DP0007 /* DepthProDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DepthProDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; DP0002 /* DepthProDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DepthProDemoApp.swift; sourceTree = "<group>"; }; DP0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; DP0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; + DP0007 /* DepthProDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DepthProDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; DP0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; + DPML01 /* DepthPro.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = DepthPro.mlpackage; sourceTree = "<group>"; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -46,6 +48,7 @@ DP0004 /* ContentView.swift */, DP0006 /* Assets.xcassets */, DP0008 /* Info.plist */, + DPML01 /* DepthPro.mlpackage */, ); path = DepthProDemo; sourceTree = "<group>"; @@ -63,7 +66,7 @@ /* Begin PBXNativeTarget section */ DP0013 /* DepthProDemo */ = { isa = PBXNativeTarget; - buildConfigurationList = DP0014; + buildConfigurationList = DP0014 /* Build configuration list for PBXNativeTarget "DepthProDemo" */; buildPhases = ( DP0015 /* Sources */, DP0009 /* Frameworks */, @@ -75,7 +78,7 @@ ); name = DepthProDemo; productName = DepthProDemo; - productReference = DP0007; + productReference = DP0007 /* DepthProDemo.app */; productType = "com.apple.product-type.application"; }; /* End PBXNativeTarget section */ @@ -93,7 +96,7 @@ }; }; }; - buildConfigurationList = DP0018; + buildConfigurationList = DP0018 /* Build configuration list for PBXProject "DepthProDemo" */; compatibilityVersion = "Xcode 14.0"; developmentRegion = en; hasScannedForEncodings = 0; @@ -102,11 +105,11 @@ Base, ); mainGroup = DP0010; - productRefGroup = DP0012; + productRefGroup = DP0012 /* Products */; projectDirPath = ""; projectRoot = ""; targets = ( - DP0013, + DP0013 /* DepthProDemo */, ); }; /* End PBXProject section */ @@ -129,6 +132,7 @@ files = ( DP0001 /* DepthProDemoApp.swift in Sources */, DP0003 /* ContentView.swift in Sources */, + DPML02 /* DepthPro.mlpackage in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -195,6 +199,7 @@ ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = MFN25KNUGJ; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = DepthProDemo/Info.plist;
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; @@ -222,6 +227,7 @@ ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = MFN25KNUGJ; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = DepthProDemo/Info.plist; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; @@ -245,26 +251,25 @@ /* End XCBuildConfiguration section */ /* Begin XCConfigurationList section */ - DP0018 /* Build configuration list for PBXProject */ = { + DP0014 /* Build configuration list for PBXNativeTarget "DepthProDemo" */ = { isa = XCConfigurationList; buildConfigurations = ( - DP0019, - DP0020, + DP0021 /* Debug */, + DP0022 /* Release */, ); defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; - DP0014 /* Build configuration list for PBXNativeTarget */ = { + DP0018 /* Build configuration list for PBXProject "DepthProDemo" */ = { isa = XCConfigurationList; buildConfigurations = ( - DP0021, - DP0022, + DP0019 /* Debug */, + DP0020 /* Release */, ); defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; /* End XCConfigurationList section */ - }; - rootObject = DP0017; + rootObject = DP0017 /* Project object */; }
diff --git a/creative_apps/KokoroDemo/KokoroDemo.xcodeproj/project.pbxproj b/creative_apps/KokoroDemo/KokoroDemo.xcodeproj/project.pbxproj index e8656e1..9f18833 100644 --- a/creative_apps/KokoroDemo/KokoroDemo.xcodeproj/project.pbxproj +++ b/creative_apps/KokoroDemo/KokoroDemo.xcodeproj/project.pbxproj @@ -10,6 +10,7 @@ KK0001 /* KokoroDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = KK0002; }; KK0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = KK0004; }; KK0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = KK0006; }; + KKML02 /* Kokoro82M.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = KKML01 /* Kokoro82M.mlpackage */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -18,6 +19,7 @@ KK0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; KK0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; KK0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; + KKML01 /* Kokoro82M.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = Kokoro82M.mlpackage; sourceTree = "<group>"; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -46,6 +48,7 @@ KK0004 /* ContentView.swift */, KK0006 /* Assets.xcassets */, KK0008 /* Info.plist */, + KKML01 /* Kokoro82M.mlpackage */, ); path = KokoroDemo; sourceTree = "<group>"; @@ -129,6 +132,7 @@ files = ( KK0001 /* KokoroDemoApp.swift in Sources */, KK0003 /* ContentView.swift in Sources */, + KKML02 /* Kokoro82M.mlpackage in Sources */, ); runOnlyForDeploymentPostprocessing = 0; };
diff --git a/creative_apps/SmolVLMDemo/SmolVLMDemo.xcodeproj/project.pbxproj b/creative_apps/SmolVLMDemo/SmolVLMDemo.xcodeproj/project.pbxproj index 02a5cf7..ec95256 100644 --- a/creative_apps/SmolVLMDemo/SmolVLMDemo.xcodeproj/project.pbxproj +++ b/creative_apps/SmolVLMDemo/SmolVLMDemo.xcodeproj/project.pbxproj @@ -10,6 +10,7 @@ SV0001 /* SmolVLMDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = SV0002; }; SV0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef =
SV0004; }; SV0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = SV0006; }; + SVML02 /* SmolVLM2_VisionEncoder.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = SVML01 /* SmolVLM2_VisionEncoder.mlpackage */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -18,6 +19,7 @@ SV0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; SV0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; SV0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; + SVML01 /* SmolVLM2_VisionEncoder.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = SmolVLM2_VisionEncoder.mlpackage; sourceTree = "<group>"; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -46,6 +48,7 @@ SV0004 /* ContentView.swift */, SV0006 /* Assets.xcassets */, SV0008 /* Info.plist */, + SVML01 /* SmolVLM2_VisionEncoder.mlpackage */, ); path = SmolVLMDemo; sourceTree = "<group>"; @@ -129,6 +132,7 @@ files = ( SV0001 /* SmolVLMDemoApp.swift in Sources */, SV0003 /* ContentView.swift in Sources */, + SVML02 /* SmolVLM2_VisionEncoder.mlpackage in Sources */, ); runOnlyForDeploymentPostprocessing = 0; };
diff --git a/creative_apps/WhisperDemo/WhisperDemo.xcodeproj/project.pbxproj b/creative_apps/WhisperDemo/WhisperDemo.xcodeproj/project.pbxproj index ec226e6..b0f321f 100644 --- a/creative_apps/WhisperDemo/WhisperDemo.xcodeproj/project.pbxproj +++ b/creative_apps/WhisperDemo/WhisperDemo.xcodeproj/project.pbxproj @@ -10,6 +10,7 @@ WH0001 /* WhisperDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = WH0002; }; WH0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = WH0004; }; WH0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = WH0006; }; + WHML02 /* WhisperTinyEncoder.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = WHML01 /* WhisperTinyEncoder.mlpackage */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -18,6 +19,7 @@ WH0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; WH0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; WH0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; + WHML01 /* WhisperTinyEncoder.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = WhisperTinyEncoder.mlpackage; sourceTree = "<group>"; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -46,6 +48,7 @@ WH0004 /* ContentView.swift */, WH0006 /* Assets.xcassets */, WH0008 /* Info.plist */, + WHML01 /* WhisperTinyEncoder.mlpackage */, ); path = WhisperDemo; sourceTree = "<group>"; @@ -129,6 +132,7 @@ files = ( WH0001 /* WhisperDemoApp.swift in Sources */, WH0003 /* ContentView.swift in Sources */, + WHML02 /* WhisperTinyEncoder.mlpackage in Sources */, ); runOnlyForDeploymentPostprocessing = 0; };
diff --git a/creative_apps/YOLOEDemo/YOLOEDemo.xcodeproj/project.pbxproj b/creative_apps/YOLOEDemo/YOLOEDemo.xcodeproj/project.pbxproj index 18769dc..91ed641 100644 --- a/creative_apps/YOLOEDemo/YOLOEDemo.xcodeproj/project.pbxproj +++
b/creative_apps/YOLOEDemo/YOLOEDemo.xcodeproj/project.pbxproj @@ -10,6 +10,7 @@ YE0001 /* YOLOEDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = YE0002; }; YE0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = YE0004; }; YE0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = YE0006; }; + YEML02 /* YOLOE_S.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = YEML01 /* YOLOE_S.mlpackage */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -18,6 +19,7 @@ YE0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; YE0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; YE0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; + YEML01 /* YOLOE_S.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = YOLOE_S.mlpackage; sourceTree = "<group>"; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -46,6 +48,7 @@ YE0004 /* ContentView.swift */, YE0006 /* Assets.xcassets */, YE0008 /* Info.plist */, + YEML01 /* YOLOE_S.mlpackage */, ); path = YOLOEDemo; sourceTree = "<group>"; @@ -129,6 +132,7 @@ files = ( YE0001 /* YOLOEDemoApp.swift in Sources */, YE0003 /* ContentView.swift in Sources */, + YEML02 /* YOLOE_S.mlpackage in Sources */, ); runOnlyForDeploymentPostprocessing = 0; };
diff --git a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo.xcodeproj/project.pbxproj b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo.xcodeproj/project.pbxproj index fe04768..cb0aa1f 100644 --- a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo.xcodeproj/project.pbxproj +++ b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo.xcodeproj/project.pbxproj @@ -10,6 +10,7 @@ DA0000010000000000000001 /* DepthAnythingV2DemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = DA0000020000000000000001 /* DepthAnythingV2DemoApp.swift */; }; DA0000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = DA0000020000000000000002 /* ContentView.swift */; }; DA0000010000000000000004 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = DA0000020000000000000004 /* Assets.xcassets */; }; + DAML02 /* DepthAnythingV2Small.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = DAML01 /* DepthAnythingV2Small.mlpackage */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -18,6 +19,7 @@ DA0000020000000000000004 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; DA0000020000000000000005 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; DA0000020000000000000010 /* DepthAnythingV2Demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DepthAnythingV2Demo.app; sourceTree = BUILT_PRODUCTS_DIR; }; + DAML01 /* DepthAnythingV2Small.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = DepthAnythingV2Small.mlpackage; sourceTree = "<group>"; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -46,6 +48,7 @@ DA0000020000000000000002 /* ContentView.swift */, DA0000020000000000000004 /* Assets.xcassets */, DA0000020000000000000005 /* Info.plist */, + DAML01 /* DepthAnythingV2Small.mlpackage */, ); path = DepthAnythingV2Demo; sourceTree = "<group>"; @@ -129,6 +132,7 @@ files = ( DA0000010000000000000001 /* DepthAnythingV2DemoApp.swift in Sources */, DA0000010000000000000002 /* ContentView.swift in Sources */, + DAML02 /* DepthAnythingV2Small.mlpackage in Sources */, ); runOnlyForDeploymentPostprocessing = 0; };
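One effect of adding the packages to the Sources phase, as these diffs do: Xcode compiles each .mlpackage into an .mlmodelc inside the app bundle and also generates a typed Swift wrapper class named after the package. With that in place, the Bundle-URL lookup used in the demos could be shortened to the generated class (a sketch; it assumes DepthAnythingV2Small.mlpackage is a member of the target):

    import CoreML

    let config = MLModelConfiguration()
    config.computeUnits = .all
    // Xcode autogenerates one class per model package in the target;
    // its prediction(...) argument labels mirror the model's input names.
    let model = try DepthAnythingV2Small(configuration: config)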
diff --git a/sample_apps/YOLOv10Demo/YOLOv10Demo.xcodeproj/project.pbxproj b/sample_apps/YOLOv10Demo/YOLOv10Demo.xcodeproj/project.pbxproj index 3f81c04..c0c1496 100644 --- a/sample_apps/YOLOv10Demo/YOLOv10Demo.xcodeproj/project.pbxproj +++ b/sample_apps/YOLOv10Demo/YOLOv10Demo.xcodeproj/project.pbxproj @@ -10,6 +10,7 @@ YV0000010000000000000001 /* YOLOv10DemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = YV0000020000000000000001 /* YOLOv10DemoApp.swift */; }; YV0000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = YV0000020000000000000002 /* ContentView.swift */; }; YV0000010000000000000004 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = YV0000020000000000000004 /* Assets.xcassets */; }; + YVML02 /* YOLOv10N.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = YVML01 /* YOLOv10N.mlpackage */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -18,6 +19,7 @@ YV0000020000000000000004 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; YV0000020000000000000005 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; YV0000020000000000000010 /* YOLOv10Demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = YOLOv10Demo.app; sourceTree = BUILT_PRODUCTS_DIR; }; + YVML01 /* YOLOv10N.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = YOLOv10N.mlpackage; sourceTree = "<group>"; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -46,6 +48,7 @@ YV0000020000000000000002 /* ContentView.swift */, YV0000020000000000000004 /* Assets.xcassets */, YV0000020000000000000005 /* Info.plist */, + YVML01 /* YOLOv10N.mlpackage */, ); path = YOLOv10Demo; sourceTree = "<group>"; @@ -129,6 +132,7 @@ files = ( YV0000010000000000000001 /* YOLOv10DemoApp.swift in Sources */, YV0000010000000000000002 /* ContentView.swift in Sources */, + YVML02 /* YOLOv10N.mlpackage in Sources */, ); runOnlyForDeploymentPostprocessing = 0; };
From 0b203bfd58f633bd8060447f75b36d89c2905d87 Mon Sep 17 00:00:00 2001 From: MLBoy_DaisukeMajima Date: Tue, 31 Mar 2026 02:37:46 +0900 Subject: [PATCH 07/18] Add device compatibility warning for Depth Pro demo
Depth Pro requires a fixed 1536x1536 input (~1.2GB model). Added a RAM requirement warning (iPhone 15 Pro+ / 6GB RAM).
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../DepthProDemo/ContentView.swift | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-)
diff --git a/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift b/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift index 8244c85..7c0a855 100644 --- a/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift +++ b/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift @@ -7,9 +7,12 @@ import CoreMotion // MARK: - Apple Depth Pro - Metric Depth Estimation Demo // // Depth Pro produces metric (absolute) depth maps from a single image, -// along with an estimated focal length. Input: 1536x1536 RGB image.
+// along with an estimated focal length. Input: 1536x1536 (fixed). // Outputs: depth map (meters) + focal length (pixels). // +// NOTE: Requires iPhone 15 Pro or later (6GB+ RAM). +// The model is ~1.2GB and processes 1536x1536 input. +// // Features: // - PhotosPicker for image selection // - Color-coded depth visualization (turbo colormap) @@ -102,6 +105,18 @@ struct ContentView: View { NavigationStack { ScrollView { VStack(spacing: 20) { + // Device compatibility warning + HStack(spacing: 8) { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundColor(.orange) + Text("Requires iPhone 15 Pro+ (6GB RAM). May crash on older devices.") + .font(.caption) + .foregroundColor(.secondary) + } + .padding(10) + .background(Color.orange.opacity(0.1)) + .cornerRadius(8) + imageSelectionSection processSection progressSection @@ -671,7 +686,10 @@ class DepthProViewModel: ObservableObject { } let config = MLModelConfiguration() + // Use CPU + Neural Engine to minimize GPU memory pressure config.computeUnits = .cpuAndNeuralEngine + + await updateStatus("Loading model (requires 6GB+ RAM)...", progress: 0.2) let model = try MLModel(contentsOf: modelURL, configuration: config) await updateStatus("Preprocessing image...", progress: 0.3) @@ -680,7 +698,8 @@ class DepthProViewModel: ObservableObject { throw DepthProError.processingFailed("No image selected.") } - // Resize to 1536x1536 for model input + // Depth Pro requires exactly 1536x1536 input (ViT patch architecture constraint). + // Requires iPhone 15 Pro or later (6GB+ RAM). let targetSize = CGSize(width: 1536, height: 1536) guard let resizedImage = resizeImage(inputImage, to: targetSize), let pixelBuffer = pixelBufferFromImage(resizedImage, size: targetSize) else { From 431180acd57ef2dec2f6045c7622efdf50cd0cf4 Mon Sep 17 00:00:00 2001 From: MLBoy_DaisukeMajima Date: Tue, 31 Mar 2026 02:41:40 +0900 Subject: [PATCH 08/18] Fix Depth Pro ANE compilation failure: use cpuAndGPU ANE fails to compile the large DepthPro model. Switch to CPU+GPU compute units. Co-Authored-By: Claude Opus 4.6 (1M context) --- creative_apps/DepthProDemo/DepthProDemo/ContentView.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift b/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift index 7c0a855..8e3c951 100644 --- a/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift +++ b/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift @@ -686,8 +686,8 @@ class DepthProViewModel: ObservableObject { } let config = MLModelConfiguration() - // Use CPU + Neural Engine to minimize GPU memory pressure - config.computeUnits = .cpuAndNeuralEngine + // ANE compilation fails on this large model. Use CPU+GPU instead. 
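// Aside (a hedged sketch, not code from this patch): if ANE availability varies
// by device, an alternative to pinning a single compute unit is to try candidates
// in order and keep the first configuration that loads. The helper name and the
// candidate order are illustrative assumptions; note that some ANE compilation
// failures only surface at the first prediction rather than at load time.
func loadModelWithFallback(at url: URL) throws -> MLModel {
    var lastError: Error?
    for units in [MLComputeUnits.cpuAndGPU, .cpuOnly] {
        let config = MLModelConfiguration()
        config.computeUnits = units
        do {
            return try MLModel(contentsOf: url, configuration: config)
        } catch {
            lastError = error // these units failed to compile/load; try the next
        }
    }
    throw lastError ?? DepthProError.processingFailed("Model failed to load on any compute unit.")
}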
+ config.computeUnits = .cpuAndGPU await updateStatus("Loading model (requires 6GB+ RAM)...", progress: 0.2) let model = try MLModel(contentsOf: modelURL, configuration: config) From d614411c539424da0f7ce633c90895400935d30b Mon Sep 17 00:00:00 2001 From: MLBoy_DaisukeMajima Date: Tue, 31 Mar 2026 02:53:06 +0900 Subject: [PATCH 09/18] Fix Depth Pro model input/output format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Model input is MLMultiArray (not pixelBuffer): convert BGRA→RGB Float16 - Output name is 'var_4563' (auto-generated), with fallback to first output - Handle Float16 output with vImage conversion - Add Accelerate import for vImage Co-Authored-By: Claude Opus 4.6 (1M context) --- .../DepthProDemo/ContentView.swift | 93 +++++++++++++++++-- 1 file changed, 85 insertions(+), 8 deletions(-) diff --git a/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift b/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift index 8e3c951..d1dafad 100644 --- a/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift +++ b/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift @@ -3,6 +3,7 @@ import UIKit import CoreML import PhotosUI import CoreMotion +import Accelerate // MARK: - Apple Depth Pro - Metric Depth Estimation Demo // @@ -708,17 +709,45 @@ class DepthProViewModel: ObservableObject { await updateStatus("Running depth estimation...", progress: 0.5) + // Model expects MLMultiArray input (1,3,1536,1536), not CVPixelBuffer + let inputArray = try MLMultiArray(shape: [1, 3, 1536, 1536], dataType: .float16) + fillMultiArrayFromPixelBuffer(pixelBuffer, into: inputArray, width: 1536, height: 1536) + let inputFeatures = try MLDictionaryFeatureProvider(dictionary: [ - "image": MLFeatureValue(pixelBuffer: pixelBuffer) + "image": MLFeatureValue(multiArray: inputArray) ]) let result = try model.prediction(from: inputFeatures) await updateStatus("Processing depth output...", progress: 0.8) - // Extract depth map - guard let depthMultiArray = result.featureValue(for: "depth")?.multiArrayValue else { - throw DepthProError.processingFailed("Model did not produce a depth output.") + // Output name is "var_4563" (auto-generated during conversion) + guard let depthMultiArray = result.featureValue(for: "var_4563")?.multiArrayValue else { + // Fallback: try first available output + let names = result.featureNames + guard let firstName = names.first, + let depthArray = result.featureValue(for: firstName)?.multiArrayValue else { + throw DepthProError.processingFailed("Model did not produce a depth output.") + } + // Use this fallback path + let shape2 = depthArray.shape.map { $0.intValue } + let dH2 = shape2.count >= 3 ? shape2[shape2.count - 2] : 1536 + let dW2 = shape2.count >= 2 ? shape2[shape2.count - 1] : 1536 + let totalPixels2 = dH2 * dW2 + let ptr2 = depthArray.dataPointer.bindMemory(to: Float.self, capacity: totalPixels2) + var depths2 = [Float](repeating: 0, count: totalPixels2) + for i in 0.. 
maxD2 { maxD2 = d }; sumD2 += d } + let meanD2 = sumD2 / Float(totalPixels2) + let depthImage2 = TurboColormap.depthMapImage(from: depths2, width: dW2, height: dH2, minDepth: minD2, maxDepth: maxD2) + await MainActor.run { + self.depthMapImage = depthImage2 + self.minDepth = minD2; self.maxDepth = maxD2; self.meanDepth = meanD2 + self.depthWidth = dW2; self.depthHeight = dH2; self.depthValues = depths2 + self.isProcessed = true; self.isProcessing = false + } + return } // Extract focal length if available @@ -745,12 +774,20 @@ class DepthProViewModel: ObservableObject { dW = 1536 } - // Copy depth values + // Copy depth values (handle both Float32 and Float16 output) let totalPixels = dH * dW - let pointer = depthMultiArray.dataPointer.bindMemory(to: Float.self, capacity: totalPixels) var depths = [Float](repeating: 0, count: totalPixels) - for i in 0.. UInt16 { + var f = value + var h: UInt16 = 0 + withUnsafePointer(to: &f) { src in + withUnsafeMutablePointer(to: &h) { dst in + var bufferFloat32 = vImage_Buffer(data: UnsafeMutableRawPointer(mutating: src), height: 1, width: 1, rowBytes: 4) + var bufferFloat16 = vImage_Buffer(data: UnsafeMutableRawPointer(dst), height: 1, width: 1, rowBytes: 2) + vImageConvert_PlanarFtoPlanar16F(&bufferFloat32, &bufferFloat16, 0) + } + } + return h + } + // MARK: - Status Updates @MainActor From 699a9e5c12743365d36eacbff2ff2b1069efb39d Mon Sep 17 00:00:00 2001 From: MLBoy_DaisukeMajima Date: Tue, 31 Mar 2026 02:56:36 +0900 Subject: [PATCH 10/18] Fix DepthPro fallback: use depthStats instead of missing properties Also handle Float16 output in fallback path with vImage conversion. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../DepthProDemo/DepthProDemo/ContentView.swift | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift b/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift index d1dafad..50d0276 100644 --- a/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift +++ b/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift @@ -734,16 +734,25 @@ class DepthProViewModel: ObservableObject { let dH2 = shape2.count >= 3 ? shape2[shape2.count - 2] : 1536 let dW2 = shape2.count >= 2 ? shape2[shape2.count - 1] : 1536 let totalPixels2 = dH2 * dW2 - let ptr2 = depthArray.dataPointer.bindMemory(to: Float.self, capacity: totalPixels2) var depths2 = [Float](repeating: 0, count: totalPixels2) - for i in 0.. 
maxD2 { maxD2 = d }; sumD2 += d } let meanD2 = sumD2 / Float(totalPixels2) let depthImage2 = TurboColormap.depthMapImage(from: depths2, width: dW2, height: dH2, minDepth: minD2, maxDepth: maxD2) await MainActor.run { self.depthMapImage = depthImage2 - self.minDepth = minD2; self.maxDepth = maxD2; self.meanDepth = meanD2 + self.depthStats = DepthStats(min: minD2, max: maxD2, mean: meanD2) self.depthWidth = dW2; self.depthHeight = dH2; self.depthValues = depths2 self.isProcessed = true; self.isProcessing = false } From 27c98a33a212a388f82c034f7fbbf90d101124b7 Mon Sep 17 00:00:00 2001 From: MLBoy_DaisukeMajima Date: Tue, 31 Mar 2026 03:14:08 +0900 Subject: [PATCH 11/18] Remove Depth Pro (too heavy for iPhone), fix BiRefNet to 512x512 - Depth Pro (1.2GB, 1536x1536 fixed) crashes on all iPhones due to memory - Removed DepthProDemo app, conversion script, and README entry - BiRefNet: reduced input from 1024x1024 to 512x512 to fit iPhone memory - BiRefNet: switched from ANE to cpuAndGPU (ANE compilation fails) Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 9 - conversion_scripts/convert_depth_pro.py | 28 - .../BiRefNetDemo.xcodeproj/project.pbxproj | 39 +- .../BiRefNetDemo/ContentView.swift | 17 +- .../DepthProDemo.xcodeproj/project.pbxproj | 275 ----- .../AccentColor.colorset/Contents.json | 11 - .../AppIcon.appiconset/Contents.json | 13 - .../Assets.xcassets/Contents.json | 6 - .../DepthProDemo/ContentView.swift | 969 ------------------ .../DepthProDemo/DepthProDemoApp.swift | 10 - .../DepthProDemo/DepthProDemo/Info.plist | 8 - 11 files changed, 29 insertions(+), 1356 deletions(-) delete mode 100644 conversion_scripts/convert_depth_pro.py delete mode 100644 creative_apps/DepthProDemo/DepthProDemo.xcodeproj/project.pbxproj delete mode 100644 creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AccentColor.colorset/Contents.json delete mode 100644 creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AppIcon.appiconset/Contents.json delete mode 100644 creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/Contents.json delete mode 100644 creative_apps/DepthProDemo/DepthProDemo/ContentView.swift delete mode 100644 creative_apps/DepthProDemo/DepthProDemo/DepthProDemoApp.swift delete mode 100644 creative_apps/DepthProDemo/DepthProDemo/Info.plist diff --git a/README.md b/README.md index a666bc9..d2c4d62 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,6 @@ You are free to do or not. - [**Monocular Depth Estimation (Next-Gen)**](#monocular-depth-estimation-next-gen) **:NEW** - [Depth Anything V2 Small](#depth-anything-v2-small) - - [Depth Pro](#depth-pro) - [**Object Detection (Next-Gen)**](#object-detection-next-gen) **:NEW** - [YOLOv10-N](#yolov10-n) @@ -1096,14 +1095,6 @@ Depth Anything V2 (TsingHua, 2024). State-of-the-art monocular depth estimation. | ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- | | [DepthAnythingV2Small (TBD)] | 25 MB | 518x518 image | 518x518 relative depth map | [DepthAnything/Depth-Anything-V2](https://github.com/DepthAnything/Depth-Anything-V2) | [Apache 2.0](https://github.com/DepthAnything/Depth-Anything-V2/blob/main/LICENSE) | 2024 | [DepthAnythingV2Demo](sample_apps/DepthAnythingV2Demo) | -### Depth Pro - -Apple Depth Pro (Apple, 2024). Metric depth estimation from a single image. Predicts absolute distance in meters with estimated focal length. Ideal for AR applications. 
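A note on the claim being removed above: the "metric depth + focal length" pairing is what made real-world measurement possible. Under the pinhole camera model, an object spanning w pixels at depth d meters has a real-world extent of roughly w * d / f meters, where f is the focal length in pixels. A minimal Swift sketch of that relationship (the helper name is illustrative; it assumes depth in meters and focal length in pixels, as the removed README section described):

    func metricExtent(pixelExtent: Float, depthMeters: Float, focalLengthPixels: Float) -> Float {
        // Pinhole model: real-world size = pixel extent * depth / focal length (in pixels).
        return pixelExtent * depthMeters / focalLengthPixels
    }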
- -| Model | Size | Input | Output | Original Project | License | Year | Sample Project | -| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- | -| [DepthPro (TBD)] | 150 MB | 1536x1536 image | metric depth map (meters) + focal length | [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro) | [Apple Sample Code License](https://github.com/apple/ml-depth-pro/blob/main/LICENSE) | 2024 | [DepthProDemo](creative_apps/DepthProDemo) | - # Object Detection (Next-Gen) ### YOLOv10-N diff --git a/conversion_scripts/convert_depth_pro.py b/conversion_scripts/convert_depth_pro.py deleted file mode 100644 index c633f33..0000000 --- a/conversion_scripts/convert_depth_pro.py +++ /dev/null @@ -1,28 +0,0 @@ -# Depth Pro -> CoreML conversion -# Apple's official repo: https://github.com/apple/ml-depth-pro -# pip install depth-pro - -import torch -import coremltools as ct -import depth_pro - -# Load model -model, transform = depth_pro.create_model_and_transforms() -model.eval() - -# Trace with dummy input -dummy = torch.randn(1, 3, 1536, 1536) - -# Note: Depth Pro outputs both depth map and focal length -# For CoreML, we trace the model and convert -traced = torch.jit.trace(model, dummy) - -mlmodel = ct.convert( - traced, - inputs=[ct.ImageType(name="image", shape=(1, 3, 1536, 1536), scale=1/255.0)], - outputs=[ct.TensorType(name="depth"), ct.TensorType(name="focallength")], - minimum_deployment_target=ct.target.iOS16, - convert_to="mlprogram", -) -mlmodel.save("DepthPro.mlpackage") -print("Saved DepthPro.mlpackage") diff --git a/creative_apps/BiRefNetDemo/BiRefNetDemo.xcodeproj/project.pbxproj b/creative_apps/BiRefNetDemo/BiRefNetDemo.xcodeproj/project.pbxproj index 95a56c9..062199f 100644 --- a/creative_apps/BiRefNetDemo/BiRefNetDemo.xcodeproj/project.pbxproj +++ b/creative_apps/BiRefNetDemo/BiRefNetDemo.xcodeproj/project.pbxproj @@ -7,17 +7,17 @@ objects = { /* Begin PBXBuildFile section */ - BR0001 /* BiRefNetDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = BR0002; }; - BR0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = BR0004; }; - BR0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = BR0006; }; + BR0001 /* BiRefNetDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = BR0002 /* BiRefNetDemoApp.swift */; }; + BR0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = BR0004 /* ContentView.swift */; }; + BR0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = BR0006 /* Assets.xcassets */; }; BRML02 /* BiRefNet.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = BRML01 /* BiRefNet.mlpackage */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ - BR0007 /* BiRefNetDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = BiRefNetDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; BR0002 /* BiRefNetDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BiRefNetDemoApp.swift; sourceTree = ""; }; BR0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; BR0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + BR0007 /* BiRefNetDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = BiRefNetDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; BR0008 /* Info.plist */ = 
{isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; BRML01 /* BiRefNet.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = BiRefNet.mlpackage; sourceTree = ""; }; /* End PBXFileReference section */ @@ -48,7 +48,7 @@ BR0004 /* ContentView.swift */, BR0006 /* Assets.xcassets */, BR0008 /* Info.plist */, - BRML01 /* BiRefNet.mlpackage */, + BRML01 /* BiRefNet.mlpackage */, ); path = BiRefNetDemo; sourceTree = ""; @@ -66,7 +66,7 @@ /* Begin PBXNativeTarget section */ BR0013 /* BiRefNetDemo */ = { isa = PBXNativeTarget; - buildConfigurationList = BR0014; + buildConfigurationList = BR0014 /* Build configuration list for PBXNativeTarget "BiRefNetDemo" */; buildPhases = ( BR0015 /* Sources */, BR0009 /* Frameworks */, @@ -78,7 +78,7 @@ ); name = BiRefNetDemo; productName = BiRefNetDemo; - productReference = BR0007; + productReference = BR0007 /* BiRefNetDemo.app */; productType = "com.apple.product-type.application"; }; /* End PBXNativeTarget section */ @@ -96,7 +96,7 @@ }; }; }; - buildConfigurationList = BR0018; + buildConfigurationList = BR0018 /* Build configuration list for PBXProject "BiRefNetDemo" */; compatibilityVersion = "Xcode 14.0"; developmentRegion = en; hasScannedForEncodings = 0; @@ -105,11 +105,11 @@ Base, ); mainGroup = BR0010; - productRefGroup = BR0012; + productRefGroup = BR0012 /* Products */; projectDirPath = ""; projectRoot = ""; targets = ( - BR0013, + BR0013 /* BiRefNetDemo */, ); }; /* End PBXProject section */ @@ -132,7 +132,7 @@ files = ( BR0001 /* BiRefNetDemoApp.swift in Sources */, BR0003 /* ContentView.swift in Sources */, - BRML02 /* BiRefNet.mlpackage in Sources */, + BRML02 /* BiRefNet.mlpackage in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -199,6 +199,7 @@ ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = MFN25KNUGJ; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = BiRefNetDemo/Info.plist; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; @@ -226,6 +227,7 @@ ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = MFN25KNUGJ; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = BiRefNetDemo/Info.plist; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; @@ -249,26 +251,25 @@ /* End XCBuildConfiguration section */ /* Begin XCConfigurationList section */ - BR0018 /* Build configuration list for PBXProject */ = { + BR0014 /* Build configuration list for PBXNativeTarget "BiRefNetDemo" */ = { isa = XCConfigurationList; buildConfigurations = ( - BR0019, - BR0020, + BR0021 /* Debug */, + BR0022 /* Release */, ); defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; - BR0014 /* Build configuration list for PBXNativeTarget */ = { + BR0018 /* Build configuration list for PBXProject "BiRefNetDemo" */ = { isa = XCConfigurationList; buildConfigurations = ( - BR0021, - BR0022, + BR0019 /* Debug */, + BR0020 /* Release */, ); defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; /* End XCConfigurationList section */ - }; - rootObject = BR0017; + rootObject = BR0017 /* Project object */; } diff --git a/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift b/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift index e5cc6ca..f765e78 100644 --- a/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift +++ 
b/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift @@ -392,7 +392,8 @@ class BackgroundRemovalViewModel: ObservableObject { } let config = MLModelConfiguration() - config.computeUnits = .cpuAndNeuralEngine + // ANE compilation fails on this model. Use CPU+GPU. + config.computeUnits = .cpuAndGPU let model = try MLModel(contentsOf: modelURL, configuration: config) await MainActor.run { @@ -400,14 +401,14 @@ class BackgroundRemovalViewModel: ObservableObject { self.progressMessage = "Preparing image..." } - // Prepare input image (1, 3, 1024, 1024) - let targetSize = CGSize(width: 1024, height: 1024) + // Prepare input image (1, 3, 512, 512) + let targetSize = CGSize(width: 512, height: 512) guard let resizedCG = image.resized(to: targetSize)?.cgImage else { throw SegmentationError.imageProcessingFailed("Failed to resize input image") } - let inputArray = try MLMultiArray(shape: [1, 3, 1024, 1024], dataType: .float32) - fillMultiArrayFromImage(resizedCG, into: inputArray, size: 1024) + let inputArray = try MLMultiArray(shape: [1, 3, 512, 512], dataType: .float32) + fillMultiArrayFromImage(resizedCG, into: inputArray, size: 512) await MainActor.run { self.progress = 0.5 @@ -425,13 +426,13 @@ class BackgroundRemovalViewModel: ObservableObject { self.progressMessage = "Generating mask..." } - // Extract mask output (1, 1, 1024, 1024), apply sigmoid + // Extract mask output (1, 1, 512, 512), apply sigmoid guard let outputArray = prediction.featureValue(for: "mask")?.multiArrayValue else { throw SegmentationError.imageProcessingFailed("Failed to extract mask output from model") } - let width = 1024 - let height = 1024 + let width = 512 + let height = 512 var maskData = [Float](repeating: 0, count: width * height) let outputPointer = outputArray.dataPointer.bindMemory(to: Float.self, capacity: width * height) diff --git a/creative_apps/DepthProDemo/DepthProDemo.xcodeproj/project.pbxproj b/creative_apps/DepthProDemo/DepthProDemo.xcodeproj/project.pbxproj deleted file mode 100644 index f9d998a..0000000 --- a/creative_apps/DepthProDemo/DepthProDemo.xcodeproj/project.pbxproj +++ /dev/null @@ -1,275 +0,0 @@ -// !$*UTF8*$! 
-{ - archiveVersion = 1; - classes = { - }; - objectVersion = 56; - objects = { - -/* Begin PBXBuildFile section */ - DP0001 /* DepthProDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = DP0002 /* DepthProDemoApp.swift */; }; - DP0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = DP0004 /* ContentView.swift */; }; - DP0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = DP0006 /* Assets.xcassets */; }; - DPML02 /* DepthPro.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = DPML01 /* DepthPro.mlpackage */; }; -/* End PBXBuildFile section */ - -/* Begin PBXFileReference section */ - DP0002 /* DepthProDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DepthProDemoApp.swift; sourceTree = ""; }; - DP0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; - DP0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; - DP0007 /* DepthProDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DepthProDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; - DP0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; - DPML01 /* DepthPro.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = DepthPro.mlpackage; sourceTree = ""; }; -/* End PBXFileReference section */ - -/* Begin PBXFrameworksBuildPhase section */ - DP0009 /* Frameworks */ = { - isa = PBXFrameworksBuildPhase; - buildActionMask = 2147483647; - files = ( - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXFrameworksBuildPhase section */ - -/* Begin PBXGroup section */ - DP0010 = { - isa = PBXGroup; - children = ( - DP0011 /* DepthProDemo */, - DP0012 /* Products */, - ); - sourceTree = ""; - }; - DP0011 /* DepthProDemo */ = { - isa = PBXGroup; - children = ( - DP0002 /* DepthProDemoApp.swift */, - DP0004 /* ContentView.swift */, - DP0006 /* Assets.xcassets */, - DP0008 /* Info.plist */, - DPML01 /* DepthPro.mlpackage */, - ); - path = DepthProDemo; - sourceTree = ""; - }; - DP0012 /* Products */ = { - isa = PBXGroup; - children = ( - DP0007 /* DepthProDemo.app */, - ); - name = Products; - sourceTree = ""; - }; -/* End PBXGroup section */ - -/* Begin PBXNativeTarget section */ - DP0013 /* DepthProDemo */ = { - isa = PBXNativeTarget; - buildConfigurationList = DP0014 /* Build configuration list for PBXNativeTarget "DepthProDemo" */; - buildPhases = ( - DP0015 /* Sources */, - DP0009 /* Frameworks */, - DP0016 /* Resources */, - ); - buildRules = ( - ); - dependencies = ( - ); - name = DepthProDemo; - productName = DepthProDemo; - productReference = DP0007 /* DepthProDemo.app */; - productType = "com.apple.product-type.application"; - }; -/* End PBXNativeTarget section */ - -/* Begin PBXProject section */ - DP0017 /* Project object */ = { - isa = PBXProject; - attributes = { - BuildIndependentTargetsInParallel = 1; - LastSwiftUpdateCheck = 1500; - LastUpgradeCheck = 1500; - TargetAttributes = { - DP0013 = { - CreatedOnToolsVersion = 15.0; - }; - }; - }; - buildConfigurationList = DP0018 /* Build configuration list for PBXProject "DepthProDemo" */; - compatibilityVersion = "Xcode 14.0"; - developmentRegion = en; - hasScannedForEncodings = 0; - knownRegions = ( - en, - Base, - ); - mainGroup = DP0010; - productRefGroup = DP0012 /* Products 
*/; - projectDirPath = ""; - projectRoot = ""; - targets = ( - DP0013 /* DepthProDemo */, - ); - }; -/* End PBXProject section */ - -/* Begin PBXResourcesBuildPhase section */ - DP0016 /* Resources */ = { - isa = PBXResourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - DP0005 /* Assets.xcassets in Resources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXResourcesBuildPhase section */ - -/* Begin PBXSourcesBuildPhase section */ - DP0015 /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - DP0001 /* DepthProDemoApp.swift in Sources */, - DP0003 /* ContentView.swift in Sources */, - DPML02 /* DepthPro.mlpackage in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXSourcesBuildPhase section */ - -/* Begin XCBuildConfiguration section */ - DP0019 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; - CLANG_ANALYZER_NONNULL = YES; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = dwarf; - ENABLE_STRICT_OBJC_MSGSEND = YES; - ENABLE_TESTABILITY = YES; - GCC_DYNAMIC_NO_PIC = NO; - GCC_OPTIMIZATION_LEVEL = 0; - GCC_PREPROCESSOR_DEFINITIONS = ( - "DEBUG=1", - "$(inherited)", - ); - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; - MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; - ONLY_ACTIVE_ARCH = YES; - SDKROOT = iphoneos; - SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; - SWIFT_OPTIMIZATION_LEVEL = "-Onone"; - }; - name = Debug; - }; - DP0020 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; - CLANG_ANALYZER_NONNULL = YES; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; - ENABLE_NS_ASSERTIONS = NO; - ENABLE_STRICT_OBJC_MSGSEND = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; - SDKROOT = iphoneos; - SWIFT_COMPILATION_MODE = wholemodule; - SWIFT_OPTIMIZATION_LEVEL = "-O"; - VALIDATE_PRODUCT = YES; - }; - name = Release; - }; - DP0021 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CODE_SIGN_STYLE = Automatic; - CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = MFN25KNUGJ; - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_FILE = DepthProDemo/Info.plist; - INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; - INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; - INFOPLIST_KEY_UILaunchScreen_Generation = YES; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.depthprodemo"; - 
PRODUCT_NAME = "$(TARGET_NAME)"; - SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Debug; - }; - DP0022 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CODE_SIGN_STYLE = Automatic; - CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = MFN25KNUGJ; - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_FILE = DepthProDemo/Info.plist; - INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; - INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; - INFOPLIST_KEY_UILaunchScreen_Generation = YES; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.depthprodemo"; - PRODUCT_NAME = "$(TARGET_NAME)"; - SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Release; - }; -/* End XCBuildConfiguration section */ - -/* Begin XCConfigurationList section */ - DP0014 /* Build configuration list for PBXNativeTarget "DepthProDemo" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - DP0021 /* Debug */, - DP0022 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; - DP0018 /* Build configuration list for PBXProject "DepthProDemo" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - DP0019 /* Debug */, - DP0020 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; -/* End XCConfigurationList section */ - }; - rootObject = DP0017 /* Project object */; -} diff --git a/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AccentColor.colorset/Contents.json deleted file mode 100644 index eb87897..0000000 --- a/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AccentColor.colorset/Contents.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "colors" : [ - { - "idiom" : "universal" - } - ], - "info" : { - "author" : "xcode", - "version" : 1 - } -} diff --git a/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AppIcon.appiconset/Contents.json deleted file mode 100644 index 13613e3..0000000 --- a/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/AppIcon.appiconset/Contents.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "images" : [ - { - "idiom" : "universal", - "platform" : "ios", - "size" : "1024x1024" - } - ], - "info" : { - "author" : "xcode", - "version" : 1 - } -} diff --git a/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/Contents.json b/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/Contents.json deleted file mode 100644 index 73c0059..0000000 --- a/creative_apps/DepthProDemo/DepthProDemo/Assets.xcassets/Contents.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "info" : { - "author" : "xcode", - "version" : 1 - } -} diff --git a/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift 
b/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift deleted file mode 100644 index 50d0276..0000000 --- a/creative_apps/DepthProDemo/DepthProDemo/ContentView.swift +++ /dev/null @@ -1,969 +0,0 @@ -import SwiftUI -import UIKit -import CoreML -import PhotosUI -import CoreMotion -import Accelerate - -// MARK: - Apple Depth Pro - Metric Depth Estimation Demo -// -// Depth Pro produces metric (absolute) depth maps from a single image, -// along with an estimated focal length. Input: 1536x1536 (fixed). -// Outputs: depth map (meters) + focal length (pixels). -// -// NOTE: Requires iPhone 15 Pro or later (6GB+ RAM). -// The model is ~1.2GB and processes 1536x1536 input. -// -// Features: -// - PhotosPicker for image selection -// - Color-coded depth visualization (turbo colormap) -// - Tap to measure distance at any point -// - 3D parallax effect using CoreMotion -// - Before/After depth overlay slider -// - Focal length display -// - Save depth map as image - -// MARK: - Turbo Colormap - -struct TurboColormap { - /// Maps a normalized value [0,1] to a turbo colormap RGB tuple. - /// Blue = far (0.0), Red = near (1.0). - static func color(for value: Float) -> (r: UInt8, g: UInt8, b: UInt8) { - let t = max(0, min(1, value)) - let r = clampByte(34.61 + t * (1172.33 - t * (10793.56 - t * (33300.12 - t * (38394.49 - t * 14825.05))))) - let g = clampByte(23.31 + t * (557.33 + t * (1225.33 - t * (3574.96 - t * (1073.77 + t * 707.56))))) - let b = clampByte(27.2 + t * (3211.1 - t * (15327.97 - t * (27814.0 - t * (22569.18 - t * 6838.66))))) - return (r, g, b) - } - - private static func clampByte(_ v: Float) -> UInt8 { - return UInt8(max(0, min(255, Int(v)))) - } - - /// Generates a UIImage depth visualization from a depth float buffer. - static func depthMapImage(from depthValues: [Float], width: Int, height: Int, minDepth: Float, maxDepth: Float) -> UIImage? { - let count = width * height - guard depthValues.count >= count else { return nil } - - var pixelData = [UInt8](repeating: 255, count: count * 4) - let range = maxDepth - minDepth - let safeRange = range > 0 ? 
range : 1.0 - - for i in 0..= 0 && normX <= 1 && normY >= 0 && normY <= 1 { - viewModel.measureDepth( - atNormalized: CGPoint(x: normX, y: normY), - viewLocation: value.location - ) - } - } - ) - - // Measurement indicator - if let measurement = viewModel.pointMeasurement { - VStack(spacing: 2) { - Image(systemName: "mappin.circle.fill") - .font(.title2) - .foregroundColor(.white) - .shadow(radius: 3) - Text(String(format: "%.2f m", measurement.depth)) - .font(.caption) - .fontWeight(.bold) - .foregroundColor(.white) - .padding(.horizontal, 8) - .padding(.vertical, 4) - .background(Color.black.opacity(0.75)) - .cornerRadius(8) - } - .position(x: measurement.viewPoint.x, - y: max(40, measurement.viewPoint.y - 30)) - } - } - .aspectRatio(depthImg.size.width / depthImg.size.height, contentMode: .fit) - } - } - } - } - - // MARK: - 3D Parallax Section - - private var parallaxSection: some View { - VStack(spacing: 12) { - Text("3D Parallax Effect") - .font(.headline) - .frame(maxWidth: .infinity, alignment: .leading) - - Text("Tilt your device to see the depth-based parallax effect.") - .font(.caption) - .foregroundColor(.secondary) - .frame(maxWidth: .infinity, alignment: .leading) - - ZStack { - if let original = viewModel.selectedImage { - Image(uiImage: original) - .resizable() - .aspectRatio(contentMode: .fit) - .cornerRadius(12) - .offset( - x: CGFloat(motionManager.roll) * -8, - y: CGFloat(motionManager.pitch) * -8 - ) - } - - if let depthImg = viewModel.depthMapImage { - Image(uiImage: depthImg) - .resizable() - .aspectRatio(contentMode: .fit) - .cornerRadius(12) - .opacity(0.45) - .offset( - x: CGFloat(motionManager.roll) * 15, - y: CGFloat(motionManager.pitch) * 15 - ) - } - } - .clipped() - .cornerRadius(12) - .shadow(color: .black.opacity(0.15), radius: 8, y: 4) - } - } - - // MARK: - Save Section - - private var saveSection: some View { - VStack(spacing: 12) { - Button(action: { viewModel.saveDepthMap() }) { - HStack { - Image(systemName: viewModel.didSave ? "checkmark.circle.fill" : "square.and.arrow.down") - Text(viewModel.didSave ? "Saved to Photos" : "Save Depth Map") - } - .frame(maxWidth: .infinity) - .padding() - .background(viewModel.didSave ? Color.green : Color(.systemGray5)) - .foregroundColor(viewModel.didSave ? .white : .primary) - .cornerRadius(12) - } - .disabled(viewModel.didSave) - } - } - - // MARK: - Colormap Legend - - private var colormapLegendSection: some View { - VStack(spacing: 8) { - Text("Depth Colormap Legend") - .font(.caption) - .foregroundColor(.secondary) - .frame(maxWidth: .infinity, alignment: .leading) - - GeometryReader { geo in - let width = geo.size.width - HStack(spacing: 0) { - ForEach(0.. some View { - VStack(spacing: 4) { - Text(label) - .font(.caption2) - .foregroundColor(.secondary) - Text(String(format: "%.2f m", value)) - .font(.subheadline) - .fontWeight(.semibold) - .foregroundColor(color) - } - .frame(maxWidth: .infinity) - } -} - -// MARK: - Point Measurement Data - -struct PointMeasurement { - let depth: Float - let normalizedPoint: CGPoint - let viewPoint: CGPoint -} - -// MARK: - Depth Statistics - -struct DepthStats { - let min: Float - let max: Float - let mean: Float -} - -// MARK: - DepthPro ViewModel - -class DepthProViewModel: ObservableObject { - @Published var photoItem: PhotosPickerItem? { - didSet { loadImage() } - } - @Published var selectedImage: UIImage? - @Published var depthMapImage: UIImage? 
- @Published var depthValues: [Float] = [] - @Published var depthWidth: Int = 0 - @Published var depthHeight: Int = 0 - @Published var estimatedFocalLength: Float? - @Published var depthStats: DepthStats? - @Published var isProcessing = false - @Published var isProcessed = false - @Published var progress: Double = 0 - @Published var statusMessage = "" - @Published var errorMessage: String? - @Published var overlaySlider: Double = 0.5 - @Published var pointMeasurement: PointMeasurement? - @Published var didSave = false - - private func loadImage() { - guard let item = photoItem else { return } - reset() - Task { - do { - if let data = try await item.loadTransferable(type: Data.self), - let uiImage = UIImage(data: data) { - await MainActor.run { - self.selectedImage = uiImage - } - } - } catch { - await MainActor.run { - self.errorMessage = "Failed to load image: \(error.localizedDescription)" - } - } - } - } - - func reset() { - depthMapImage = nil - depthValues = [] - depthWidth = 0 - depthHeight = 0 - estimatedFocalLength = nil - depthStats = nil - isProcessed = false - isProcessing = false - progress = 0 - statusMessage = "" - errorMessage = nil - pointMeasurement = nil - didSave = false - } - - func estimateDepth() { - guard selectedImage != nil else { return } - isProcessing = true - errorMessage = nil - progress = 0 - - Task { - do { - try await performDepthEstimation() - await MainActor.run { - self.isProcessed = true - self.isProcessing = false - } - } catch { - await MainActor.run { - self.errorMessage = error.localizedDescription - self.isProcessing = false - } - } - } - } - - // MARK: - Core ML Inference - - private func performDepthEstimation() async throws { - await updateStatus("Loading Depth Pro model...", progress: 0.1) - - guard let modelURL = Bundle.main.url(forResource: "DepthPro", withExtension: "mlmodelc") else { - throw DepthProError.modelNotFound( - "DepthPro.mlmodelc not found in bundle. " + - "Please convert the model using convert_depth_pro.py, " + - "compile the .mlpackage, and add DepthPro.mlmodelc to the project." - ) - } - - let config = MLModelConfiguration() - // ANE compilation fails on this large model. Use CPU+GPU instead. - config.computeUnits = .cpuAndGPU - - await updateStatus("Loading model (requires 6GB+ RAM)...", progress: 0.2) - let model = try MLModel(contentsOf: modelURL, configuration: config) - - await updateStatus("Preprocessing image...", progress: 0.3) - - guard let inputImage = selectedImage else { - throw DepthProError.processingFailed("No image selected.") - } - - // Depth Pro requires exactly 1536x1536 input (ViT patch architecture constraint). - // Requires iPhone 15 Pro or later (6GB+ RAM). 
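// Aside (illustrative sketch, not from the removed file): the RAM floor cited in
// the comments above can also be checked at runtime before attempting the load.
// `hasEnoughMemoryForDepthPro` is a hypothetical helper; devices typically report
// slightly less than their nominal RAM, so the threshold sits a little below 6 GB.
var hasEnoughMemoryForDepthPro: Bool {
    let floorBytes: UInt64 = 5 * 1_024 * 1_024 * 1_024 // physicalMemory is in bytes
    return ProcessInfo.processInfo.physicalMemory >= floorBytes
}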
- let targetSize = CGSize(width: 1536, height: 1536) - guard let resizedImage = resizeImage(inputImage, to: targetSize), - let pixelBuffer = pixelBufferFromImage(resizedImage, size: targetSize) else { - throw DepthProError.processingFailed("Failed to preprocess image for model input.") - } - - await updateStatus("Running depth estimation...", progress: 0.5) - - // Model expects MLMultiArray input (1,3,1536,1536), not CVPixelBuffer - let inputArray = try MLMultiArray(shape: [1, 3, 1536, 1536], dataType: .float16) - fillMultiArrayFromPixelBuffer(pixelBuffer, into: inputArray, width: 1536, height: 1536) - - let inputFeatures = try MLDictionaryFeatureProvider(dictionary: [ - "image": MLFeatureValue(multiArray: inputArray) - ]) - - let result = try model.prediction(from: inputFeatures) - - await updateStatus("Processing depth output...", progress: 0.8) - - // Output name is "var_4563" (auto-generated during conversion) - guard let depthMultiArray = result.featureValue(for: "var_4563")?.multiArrayValue else { - // Fallback: try first available output - let names = result.featureNames - guard let firstName = names.first, - let depthArray = result.featureValue(for: firstName)?.multiArrayValue else { - throw DepthProError.processingFailed("Model did not produce a depth output.") - } - // Use this fallback path - let shape2 = depthArray.shape.map { $0.intValue } - let dH2 = shape2.count >= 3 ? shape2[shape2.count - 2] : 1536 - let dW2 = shape2.count >= 2 ? shape2[shape2.count - 1] : 1536 - let totalPixels2 = dH2 * dW2 - var depths2 = [Float](repeating: 0, count: totalPixels2) - if depthArray.dataType == .float32 { - let ptr2 = depthArray.dataPointer.bindMemory(to: Float.self, capacity: totalPixels2) - for i in 0.. maxD2 { maxD2 = d }; sumD2 += d } - let meanD2 = sumD2 / Float(totalPixels2) - let depthImage2 = TurboColormap.depthMapImage(from: depths2, width: dW2, height: dH2, minDepth: minD2, maxDepth: maxD2) - await MainActor.run { - self.depthMapImage = depthImage2 - self.depthStats = DepthStats(min: minD2, max: maxD2, mean: meanD2) - self.depthWidth = dW2; self.depthHeight = dH2; self.depthValues = depths2 - self.isProcessed = true; self.isProcessing = false - } - return - } - - // Extract focal length if available - var focalLength: Float? = nil - if let focalArray = result.featureValue(for: "focallength")?.multiArrayValue { - focalLength = focalArray[0].floatValue - } - - // Parse depth map dimensions - let shape = depthMultiArray.shape.map { $0.intValue } - let dH: Int - let dW: Int - if shape.count == 4 { - dH = shape[2] - dW = shape[3] - } else if shape.count == 3 { - dH = shape[1] - dW = shape[2] - } else if shape.count == 2 { - dH = shape[0] - dW = shape[1] - } else { - dH = 1536 - dW = 1536 - } - - // Copy depth values (handle both Float32 and Float16 output) - let totalPixels = dH * dW - var depths = [Float](repeating: 0, count: totalPixels) - if depthMultiArray.dataType == .float32 { - let pointer = depthMultiArray.dataPointer.bindMemory(to: Float.self, capacity: totalPixels) - for i in 0.. 
maxD { maxD = d } - sumD += d - } - let meanD = sumD / Float(totalPixels) - - // Generate colorized depth image - let depthImage = TurboColormap.depthMapImage(from: depths, width: dW, height: dH, minDepth: minD, maxDepth: maxD) - - await updateStatus("Complete!", progress: 1.0) - - await MainActor.run { - self.depthValues = depths - self.depthWidth = dW - self.depthHeight = dH - self.depthMapImage = depthImage - self.estimatedFocalLength = focalLength - self.depthStats = DepthStats(min: minD, max: maxD, mean: meanD) - } - } - - // MARK: - Measure Depth at Point - - func measureDepth(atNormalized point: CGPoint, viewLocation: CGPoint) { - guard !depthValues.isEmpty, depthWidth > 0, depthHeight > 0 else { return } - - let px = Int(point.x * CGFloat(depthWidth)) - let py = Int(point.y * CGFloat(depthHeight)) - let clampedX = max(0, min(depthWidth - 1, px)) - let clampedY = max(0, min(depthHeight - 1, py)) - let index = clampedY * depthWidth + clampedX - - guard index >= 0 && index < depthValues.count else { return } - let depth = depthValues[index] - - pointMeasurement = PointMeasurement( - depth: depth, - normalizedPoint: point, - viewPoint: viewLocation - ) - } - - // MARK: - Save Depth Map - - func saveDepthMap() { - guard let image = depthMapImage else { return } - UIImageWriteToSavedPhotosAlbum(image, nil, nil, nil) - didSave = true - } - - // MARK: - Image Utilities - - private func resizeImage(_ image: UIImage, to size: CGSize) -> UIImage? { - UIGraphicsBeginImageContextWithOptions(size, true, 1.0) - image.draw(in: CGRect(origin: .zero, size: size)) - let resized = UIGraphicsGetImageFromCurrentImageContext() - UIGraphicsEndImageContext() - return resized - } - - private func pixelBufferFromImage(_ image: UIImage, size: CGSize) -> CVPixelBuffer? { - let width = Int(size.width) - let height = Int(size.height) - let attrs: [CFString: Any] = [ - kCVPixelBufferCGImageCompatibilityKey: true, - kCVPixelBufferCGBitmapContextCompatibilityKey: true - ] - var pixelBuffer: CVPixelBuffer? - let status = CVPixelBufferCreate( - kCFAllocatorDefault, width, height, - kCVPixelFormatType_32BGRA, - attrs as CFDictionary, - &pixelBuffer - ) - guard status == kCVReturnSuccess, let buffer = pixelBuffer else { return nil } - - CVPixelBufferLockBaseAddress(buffer, []) - defer { CVPixelBufferUnlockBaseAddress(buffer, []) } - - guard let context = CGContext( - data: CVPixelBufferGetBaseAddress(buffer), - width: width, - height: height, - bitsPerComponent: 8, - bytesPerRow: CVPixelBufferGetBytesPerRow(buffer), - space: CGColorSpaceCreateDeviceRGB(), - bitmapInfo: CGBitmapInfo.byteOrder32Little.rawValue | CGImageAlphaInfo.premultipliedFirst.rawValue - ) else { return nil } - - guard let cgImage = image.cgImage else { return nil } - context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height)) - - return buffer - } - - // MARK: - PixelBuffer to MLMultiArray - - private func fillMultiArrayFromPixelBuffer(_ buffer: CVPixelBuffer, into array: MLMultiArray, width: Int, height: Int) { - CVPixelBufferLockBaseAddress(buffer, .readOnly) - defer { CVPixelBufferUnlockBaseAddress(buffer, .readOnly) } - - guard let baseAddress = CVPixelBufferGetBaseAddress(buffer) else { return } - let bytesPerRow = CVPixelBufferGetBytesPerRow(buffer) - let ptr = baseAddress.assumingMemoryBound(to: UInt8.self) - - // BGRA → RGB normalized to [0,1], stored as Float16 - let fp16Ptr = array.dataPointer.bindMemory(to: UInt16.self, capacity: 3 * width * height) - for y in 0.. 
UInt16 { - var f = value - var h: UInt16 = 0 - withUnsafePointer(to: &f) { src in - withUnsafeMutablePointer(to: &h) { dst in - var bufferFloat32 = vImage_Buffer(data: UnsafeMutableRawPointer(mutating: src), height: 1, width: 1, rowBytes: 4) - var bufferFloat16 = vImage_Buffer(data: UnsafeMutableRawPointer(dst), height: 1, width: 1, rowBytes: 2) - vImageConvert_PlanarFtoPlanar16F(&bufferFloat32, &bufferFloat16, 0) - } - } - return h - } - - // MARK: - Status Updates - - @MainActor - private func updateStatus(_ message: String, progress: Double) { - self.statusMessage = message - self.progress = progress - } -} - -// MARK: - Errors - -enum DepthProError: LocalizedError { - case modelNotFound(String) - case processingFailed(String) - - var errorDescription: String? { - switch self { - case .modelNotFound(let msg): return msg - case .processingFailed(let msg): return msg - } - } -} - -// MARK: - Preview - -#Preview { - ContentView() -} diff --git a/creative_apps/DepthProDemo/DepthProDemo/DepthProDemoApp.swift b/creative_apps/DepthProDemo/DepthProDemo/DepthProDemoApp.swift deleted file mode 100644 index a306119..0000000 --- a/creative_apps/DepthProDemo/DepthProDemo/DepthProDemoApp.swift +++ /dev/null @@ -1,10 +0,0 @@ -import SwiftUI - -@main -struct DepthProDemoApp: App { - var body: some Scene { - WindowGroup { - ContentView() - } - } -} diff --git a/creative_apps/DepthProDemo/DepthProDemo/Info.plist b/creative_apps/DepthProDemo/DepthProDemo/Info.plist deleted file mode 100644 index 8e27fd4..0000000 --- a/creative_apps/DepthProDemo/DepthProDemo/Info.plist +++ /dev/null @@ -1,8 +0,0 @@ - - - - - NSPhotoLibraryUsageDescription - This app needs photo library access for selecting images for depth estimation. - - From f93089ea2eb8efdff0245277b2775e490b481d9a Mon Sep 17 00:00:00 2001 From: MLBoy_DaisukeMajima Date: Tue, 31 Mar 2026 03:22:28 +0900 Subject: [PATCH 12/18] Fix BiRefNet Float16 input/output handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model uses Float16, not Float32. Reading Float32 from Float16 buffer produced garbage → NaN → UInt8 crash. - Input: write as Float16 via vImage conversion - Output: read as Float16 and convert to Float32 via vImage - Add NaN guard in mask-to-image conversion - Add Accelerate import for vImage Co-Authored-By: Claude Opus 4.6 (1M context) --- .../BiRefNetDemo/ContentView.swift | 54 ++++++++++++++----- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift b/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift index f765e78..ee07dce 100644 --- a/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift +++ b/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift @@ -4,6 +4,7 @@ import CoreML import Vision import PhotosUI import Photos +import Accelerate // MARK: - Background Removal using BiRefNet // BiRefNet is a bilateral reference network for high-resolution dichotomous image segmentation. 
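// Aside (a hedged sketch, not part of the patch): the Float16 handling added in
// this commit can be factored into a single bulk converter. vImage converts an
// entire plane per call, which is far cheaper than element-by-element conversion.
// The helper name is an illustrative assumption.
func float16BufferToFloats(_ src: UnsafeMutableRawPointer, count: Int) -> [Float] {
    var dst = [Float](repeating: 0, count: count)
    var srcBuf = vImage_Buffer(data: src, height: 1, width: vImagePixelCount(count), rowBytes: count * 2)
    dst.withUnsafeMutableBufferPointer { ptr in
        var dstBuf = vImage_Buffer(data: ptr.baseAddress!, height: 1, width: vImagePixelCount(count), rowBytes: count * 4)
        vImageConvert_Planar16FtoPlanarF(&srcBuf, &dstBuf, 0)
    }
    return dst
}
// Example use against an MLMultiArray whose dataType is .float16:
//   let rawLogits = float16BufferToFloats(outputArray.dataPointer, count: 512 * 512)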
@@ -407,7 +408,7 @@ class BackgroundRemovalViewModel: ObservableObject { throw SegmentationError.imageProcessingFailed("Failed to resize input image") } - let inputArray = try MLMultiArray(shape: [1, 3, 512, 512], dataType: .float32) + let inputArray = try MLMultiArray(shape: [1, 3, 512, 512], dataType: .float16) fillMultiArrayFromImage(resizedCG, into: inputArray, size: 512) await MainActor.run { @@ -433,12 +434,20 @@ class BackgroundRemovalViewModel: ObservableObject { let width = 512 let height = 512 - var maskData = [Float](repeating: 0, count: width * height) - - let outputPointer = outputArray.dataPointer.bindMemory(to: Float.self, capacity: width * height) - for i in 0..<(width * height) { - let raw = outputPointer[i] - maskData[i] = 1.0 / (1.0 + exp(-raw)) // sigmoid + let totalPixels = width * height + var maskData = [Float](repeating: 0, count: totalPixels) + + // Output is Float16 - read as UInt16 and convert to Float32 + let fp16Ptr = outputArray.dataPointer.bindMemory(to: UInt16.self, capacity: totalPixels) + var rawFloats = [Float](repeating: 0, count: totalPixels) + var srcBuf = vImage_Buffer(data: UnsafeMutableRawPointer(mutating: fp16Ptr), height: 1, width: vImagePixelCount(totalPixels), rowBytes: totalPixels * 2) + rawFloats.withUnsafeMutableBufferPointer { dstBufPtr in + var dstBuf = vImage_Buffer(data: dstBufPtr.baseAddress!, height: 1, width: vImagePixelCount(totalPixels), rowBytes: totalPixels * 4) + vImageConvert_Planar16FtoPlanarF(&srcBuf, &dstBuf, 0) + } + for i in 0.. Date: Tue, 31 Mar 2026 03:24:36 +0900 Subject: [PATCH 13/18] Fix BiRefNet cutout rotation: normalize image orientation before masking UIImage from PhotosPicker can have rotation metadata (imageOrientation). CGImage ignores this, causing a 90-degree mismatch between mask and cutout. Normalize to .up orientation before extracting CGImage pixels. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../BiRefNetDemo/ContentView.swift | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift b/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift index ee07dce..1b383b0 100644 --- a/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift +++ b/creative_apps/BiRefNetDemo/BiRefNetDemo/ContentView.swift @@ -561,10 +561,12 @@ class BackgroundRemovalViewModel: ObservableObject { private func applyMask(to image: UIImage, maskData: [Float], maskWidth: Int, maskHeight: Int, background: UIColor?) -> UIImage? { - let origWidth = Int(image.size.width) - let origHeight = Int(image.size.height) + // Normalize orientation first to avoid rotation mismatch + let normalizedImage = normalizeOrientation(image) + let origWidth = Int(normalizedImage.size.width) + let origHeight = Int(normalizedImage.size.height) - guard let cgImage = image.cgImage else { return nil } + guard let cgImage = normalizedImage.cgImage else { return nil } let bytesPerPixel = 4 let bytesPerRow = bytesPerPixel * origWidth @@ -634,6 +636,16 @@ class BackgroundRemovalViewModel: ObservableObject { return UIImage(cgImage: outputCG) } + + /// Redraw UIImage with .up orientation to strip rotation metadata + private func normalizeOrientation(_ image: UIImage) -> UIImage { + guard image.imageOrientation != .up else { return image } + UIGraphicsBeginImageContextWithOptions(image.size, false, image.scale) + image.draw(in: CGRect(origin: .zero, size: image.size)) + let normalized = UIGraphicsGetImageFromCurrentImageContext() + UIGraphicsEndImageContext() + return normalized ?? 
image + } } // MARK: - Errors From 4fcde2030f81cc3b72732a9e923f85c609fad461 Mon Sep 17 00:00:00 2001 From: john-rocky Date: Tue, 31 Mar 2026 18:43:18 +0900 Subject: [PATCH 14/18] Remove apps with existing official implementations, implement LivePortrait pipeline - Remove DepthAnythingV2Demo (Apple official CoreML model available) - Remove WhisperDemo (WhisperKit provides full implementation) - Remove DWPoseDemo (Apple Vision API has built-in pose detection) - Remove corresponding conversion scripts - Update README to reference official implementations - Fix YOLOv10Demo to parse raw MultiArray output [1,300,6] - Implement full LivePortrait animation pipeline with 4-model inference - Add AppIcon.appiconset to YOLOv10Demo --- README.md | 56 +- .../convert_depth_anything_v2.py | 24 - conversion_scripts/convert_dwpose.py | 25 - conversion_scripts/convert_whisper.py | 36 - .../project.pbxproj | 43 +- .../LivePortraitDemo/ContentView.swift | 740 +++++++++------- .../WhisperDemo.xcodeproj/project.pbxproj | 274 ------ .../AccentColor.colorset/Contents.json | 11 - .../WhisperDemo/Assets.xcassets/Contents.json | 6 - .../WhisperDemo/WhisperDemo/ContentView.swift | 830 ------------------ .../WhisperDemo/WhisperDemo/Info.plist | 8 - .../WhisperDemo/WhisperDemoApp.swift | 10 - .../DWPoseDemo.xcodeproj/project.pbxproj | 340 ------- .../AccentColor.colorset/Contents.json | 11 - .../DWPoseDemo/Assets.xcassets/Contents.json | 6 - .../DWPoseDemo/DWPoseDemo/ContentView.swift | 659 -------------- .../DWPoseDemo/DWPoseDemo/DWPoseDemoApp.swift | 10 - sample_apps/DWPoseDemo/DWPoseDemo/Info.plist | 8 - .../project.pbxproj | 344 -------- .../AccentColor.colorset/Contents.json | 11 - .../Assets.xcassets/Contents.json | 6 - .../DepthAnythingV2Demo/ContentView.swift | 438 --------- .../DepthAnythingV2DemoApp.swift | 10 - .../DepthAnythingV2Demo/Info.plist | 8 - .../AppIcon.appiconset/Contents.json | 0 .../YOLOv10Demo/YOLOv10Demo/ContentView.swift | 44 +- 26 files changed, 520 insertions(+), 3438 deletions(-) delete mode 100644 conversion_scripts/convert_depth_anything_v2.py delete mode 100644 conversion_scripts/convert_dwpose.py delete mode 100644 conversion_scripts/convert_whisper.py delete mode 100644 creative_apps/WhisperDemo/WhisperDemo.xcodeproj/project.pbxproj delete mode 100644 creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AccentColor.colorset/Contents.json delete mode 100644 creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/Contents.json delete mode 100644 creative_apps/WhisperDemo/WhisperDemo/ContentView.swift delete mode 100644 creative_apps/WhisperDemo/WhisperDemo/Info.plist delete mode 100644 creative_apps/WhisperDemo/WhisperDemo/WhisperDemoApp.swift delete mode 100644 sample_apps/DWPoseDemo/DWPoseDemo.xcodeproj/project.pbxproj delete mode 100644 sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/AccentColor.colorset/Contents.json delete mode 100644 sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/Contents.json delete mode 100644 sample_apps/DWPoseDemo/DWPoseDemo/ContentView.swift delete mode 100644 sample_apps/DWPoseDemo/DWPoseDemo/DWPoseDemoApp.swift delete mode 100644 sample_apps/DWPoseDemo/DWPoseDemo/Info.plist delete mode 100644 sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo.xcodeproj/project.pbxproj delete mode 100644 sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json delete mode 100644 sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/Contents.json delete mode 100644 
sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/ContentView.swift delete mode 100644 sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/DepthAnythingV2DemoApp.swift delete mode 100644 sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Info.plist rename {creative_apps/WhisperDemo/WhisperDemo => sample_apps/YOLOv10Demo/YOLOv10Demo}/Assets.xcassets/AppIcon.appiconset/Contents.json (100%) diff --git a/README.md b/README.md index d2c4d62..c0ae5e2 100644 --- a/README.md +++ b/README.md @@ -136,8 +136,7 @@ You are free to do or not. - [**Image Deblurring**](#image-deblurring) **:NEW** - [NAFNet](#nafnet) -- [**Monocular Depth Estimation (Next-Gen)**](#monocular-depth-estimation-next-gen) **:NEW** - - [Depth Anything V2 Small](#depth-anything-v2-small) +- [**Monocular Depth Estimation (Next-Gen)**](#monocular-depth-estimation-next-gen) — [Official CoreML](https://huggingface.co/apple/coreml-depth-anything-v2-small) - [**Object Detection (Next-Gen)**](#object-detection-next-gen) **:NEW** - [YOLOv10-N](#yolov10-n) @@ -145,8 +144,7 @@ You are free to do or not. - [**Background Removal (SOTA)**](#background-removal-sota) **:NEW** - [BiRefNet](#birefnet) -- [**Speech Recognition**](#speech-recognition) **:NEW** - - [Whisper Tiny](#whisper-tiny) +- [**Speech Recognition**](#speech-recognition) — [WhisperKit](https://github.com/argmaxinc/WhisperKit) - [**Text-to-Speech**](#text-to-speech) **:NEW** - [Kokoro-82M](#kokoro-82m) @@ -157,8 +155,7 @@ You are free to do or not. - [**Open-Vocabulary Detection**](#open-vocabulary-detection) **:NEW** - [YOLOE-S](#yoloe-s) -- [**Pose Estimation**](#pose-estimation) **:NEW** - - [DWPose / RTMPose](#dwpose--rtmpose) +- [**Pose Estimation**](#pose-estimation) — [Apple Vision API](https://developer.apple.com/documentation/vision/vndetecthumanbodyposerequest) - [**Multilingual OCR**](#multilingual-ocr) **:NEW** - [PP-OCRv5](#pp-ocrv5) @@ -266,19 +263,15 @@ A ConvNet for the 2020s. Pure CNN architecture that competes with Vision Transfo ### FastViT-T8 -Apple's hybrid vision transformer. Ultra-fast inference with structural reparameterization. 76.2% top-1 accuracy. - -| Google Drive Link | Size | Dataset |Original Project | License |Year| Sample Project | -| ------------- | ------------- | ------------- |------------- |------------- |------------- |------------- | -| FastViT-T8 (TBD) | 7.8 MB | ImageNet | [apple/ml-fastvit](https://github.com/apple/ml-fastvit) | [Apple](https://github.com/apple/ml-fastvit/blob/main/LICENSE)|2023| [FastViTDemo](sample_apps/FastViTDemo) | +> **Official CoreML model and sample app available:** +> - CoreML Model: [apple/coreml-FastViT-T8](https://huggingface.co/apple/coreml-FastViT-T8) +> - iOS Sample: [huggingface/coreml-examples/FastViTSample](https://github.com/huggingface/coreml-examples/tree/main/FastViTSample) +> - Source: [apple/ml-fastvit](https://github.com/apple/ml-fastvit) ### MobileOne-S0 -Apple's sub-millisecond mobile backbone. Optimized for on-device inference with reparameterizable architecture. 71.4% top-1 accuracy. 
-
-| Google Drive Link | Size | Dataset |Original Project | License |Year| Sample Project |
-| ------------- | ------------- | ------------- |------------- |------------- |------------- |------------- |
-| MobileOne-S0 (TBD) | 10.4 MB | ImageNet | [apple/ml-mobileone](https://github.com/apple/ml-mobileone) | [Apple](https://github.com/apple/ml-mobileone/blob/main/LICENSE)|2022| [MobileOneDemo](sample_apps/MobileOneDemo) |
+> **Official CoreML model and benchmark app available:**
+> - CoreML Model + iOS App: [apple/ml-mobileone](https://github.com/apple/ml-mobileone)
 
 ### EfficientFormerV2-S0
 
@@ -1089,11 +1082,11 @@ Nonlinear Activation Free Network. State-of-the-art image deblurring without non
 
 ### Depth Anything V2 Small
 
-Depth Anything V2 (TsingHua, 2024). State-of-the-art monocular depth estimation. Massively improved over MiDaS with synthetic data training. The Small variant is extremely lightweight (~25 MB).
+Depth Anything V2 (Tsinghua, 2024). State-of-the-art monocular depth estimation.
 
-| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
-| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
-| [DepthAnythingV2Small (TBD)] | 25 MB | 518x518 image | 518x518 relative depth map | [DepthAnything/Depth-Anything-V2](https://github.com/DepthAnything/Depth-Anything-V2) | [Apache 2.0](https://github.com/DepthAnything/Depth-Anything-V2/blob/main/LICENSE) | 2024 | [DepthAnythingV2Demo](sample_apps/DepthAnythingV2Demo) |
+> **Official CoreML model and iOS sample app available:**
+> - CoreML Model: [apple/coreml-depth-anything-v2-small](https://huggingface.co/apple/coreml-depth-anything-v2-small)
+> - iOS Sample: [huggingface/coreml-examples/depth-anything-example](https://github.com/huggingface/coreml-examples/tree/main/depth-anything-example)
 
 # Object Detection (Next-Gen)
 
@@ -1117,15 +1110,13 @@ Bilateral Reference Network (2024). State-of-the-art dichotomous image segmentat
 
 # Speech Recognition
 
-### Whisper Tiny
-
-OpenAI Whisper Tiny (OpenAI, 2023). Multilingual speech-to-text model supporting 99+ languages. The Tiny variant (~75 MB) is ideal for on-device transcription. Apple provides official CoreML conversion via WhisperKit.
+### Whisper
 
-| Model | Size | Input | Output | Original Project | License | Year | Sample Project |
-| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- |
-| [WhisperTinyEncoder (TBD)] | 75 MB | mel spectrogram (1,80,3000) | encoder hidden states | [openai/whisper](https://github.com/openai/whisper) | [MIT](https://github.com/openai/whisper/blob/main/LICENSE) | 2023 | [WhisperDemo](creative_apps/WhisperDemo) |
+OpenAI Whisper (OpenAI, 2023). Multilingual speech-to-text model supporting 99+ languages.
 
-Note: For production use, consider [WhisperKit](https://github.com/argmaxinc/WhisperKit) which provides optimized CoreML models with full encoder+decoder pipeline.
+> **Full CoreML implementation available:**
+> - [argmaxinc/WhisperKit](https://github.com/argmaxinc/WhisperKit) — Optimized CoreML models (Tiny to Large) with full encoder+decoder pipeline, Swift Package, MIT license
+> - CoreML Models: [argmaxinc/whisperkit-coreml](https://huggingface.co/argmaxinc/whisperkit-coreml)
 
 # Text-to-Speech
 
@@ -1163,13 +1154,12 @@ YOLOE-S (Tsinghua, ICCV 2025). Real-time open-vocabulary object detection and se
 
 # Pose Estimation
 
-### DWPose / RTMPose
+### Human Body Pose
 
-DWPose + RTMPose (2023-2025). Real-time whole-body pose estimation with 133 keypoints (body, hands, face, feet). 
DWPose uses distillation from larger models for excellent accuracy in a compact package. 70+ FPS on mobile. - -| Model | Size | Input | Output | Original Project | License | Year | Sample Project | -| ----- | ---- | ----- | ------ | ---------------- | ------- | ---- | -------------- | -| [DWPose (TBD)] | 15-54 MB | 256x192 image | 17-133 keypoint heatmaps (SimCC) | [IDEA-Research/DWPose](https://github.com/IDEA-Research/DWPose) | [Apache 2.0](https://github.com/open-mmlab/mmpose/blob/main/LICENSE) | 2023 | [DWPoseDemo](sample_apps/DWPoseDemo) | +> **Built-in to Apple Vision framework:** +> - [`VNDetectHumanBodyPoseRequest`](https://developer.apple.com/documentation/vision/vndetecthumanbodyposerequest) — 19 body keypoints, no model download needed +> - [`VNDetectHumanBodyPose3DRequest`](https://developer.apple.com/documentation/vision/vndetecthumanbodypose3drequest) — 3D pose estimation (iOS 17+) +> - For more keypoints (hands, face), see also [`VNDetectHumanHandPoseRequest`](https://developer.apple.com/documentation/vision/vndetecthumanhandposerequest) # Multilingual OCR diff --git a/conversion_scripts/convert_depth_anything_v2.py b/conversion_scripts/convert_depth_anything_v2.py deleted file mode 100644 index d2b3205..0000000 --- a/conversion_scripts/convert_depth_anything_v2.py +++ /dev/null @@ -1,24 +0,0 @@ -# Depth Anything V2 Small -> CoreML conversion -# pip install torch torchvision coremltools transformers -import torch -import coremltools as ct -from transformers import AutoModelForDepthEstimation, AutoImageProcessor - -model_name = "depth-anything/Depth-Anything-V2-Small-hf" -model = AutoModelForDepthEstimation.from_pretrained(model_name) -model.eval() - -# Trace -dummy = torch.randn(1, 3, 518, 518) -traced = torch.jit.trace(model, dummy) - -# Convert -mlmodel = ct.convert( - traced, - inputs=[ct.ImageType(name="image", shape=(1, 3, 518, 518), scale=1/255.0, bias=[0, 0, 0])], - outputs=[ct.TensorType(name="depth")], - minimum_deployment_target=ct.target.iOS16, - convert_to="mlprogram", -) -mlmodel.save("DepthAnythingV2Small.mlpackage") -print("Saved DepthAnythingV2Small.mlpackage") diff --git a/conversion_scripts/convert_dwpose.py b/conversion_scripts/convert_dwpose.py deleted file mode 100644 index 77ab7ec..0000000 --- a/conversion_scripts/convert_dwpose.py +++ /dev/null @@ -1,25 +0,0 @@ -# DWPose / RTMPose -> CoreML conversion -# DWPose uses RTMPose as backbone with distillation -# pip install torch coremltools onnx onnxruntime - -import coremltools as ct -import onnx - -# Download RTMPose ONNX model from: -# https://github.com/open-mmlab/mmpose/tree/main/projects/rtmpose -# rtmpose-m_simcc-body7_pt-body7_420e-256x192.onnx - -# For whole-body (133 keypoints): -# dwpose: rtmw-x_simcc-cocktail14_pt-ucoco_270e-384x288.onnx - -onnx_path = "rtmpose-m_simcc-body7_pt-body7_420e-256x192.onnx" -onnx_model = onnx.load(onnx_path) - -mlmodel = ct.converters.convert( - onnx_model, - inputs=[ct.ImageType(name="image", shape=(1, 3, 256, 192), scale=1/255.0)], - minimum_deployment_target=ct.target.iOS16, - convert_to="mlprogram", -) -mlmodel.save("DWPose.mlpackage") -print("Saved DWPose.mlpackage") diff --git a/conversion_scripts/convert_whisper.py b/conversion_scripts/convert_whisper.py deleted file mode 100644 index a0fb97c..0000000 --- a/conversion_scripts/convert_whisper.py +++ /dev/null @@ -1,36 +0,0 @@ -# Whisper Tiny -> CoreML conversion -# Apple provides official conversion via whisperkittools -# pip install whisperkittools -# Alternatively, use huggingface optimum: -# pip install 
optimum[exporters] - -# Method 1: Using whisperkit (recommended) -# python -m whisperkittools.generate_model --model openai/whisper-tiny --output-dir . - -# Method 2: Manual conversion -import torch -import coremltools as ct -from transformers import WhisperForConditionalGeneration, WhisperProcessor - -model_name = "openai/whisper-tiny" -model = WhisperForConditionalGeneration.from_pretrained(model_name) -processor = WhisperProcessor.from_pretrained(model_name) -model.eval() - -# Convert encoder -encoder = model.get_encoder() -mel_input = torch.randn(1, 80, 3000) -traced_encoder = torch.jit.trace(encoder, mel_input) - -encoder_ml = ct.convert( - traced_encoder, - inputs=[ct.TensorType(name="mel_input", shape=(1, 80, 3000))], - outputs=[ct.TensorType(name="encoder_output")], - minimum_deployment_target=ct.target.iOS16, - convert_to="mlprogram", -) -encoder_ml.save("WhisperTinyEncoder.mlpackage") -print("Saved WhisperTinyEncoder.mlpackage") - -# Note: Decoder conversion requires more complex handling for autoregressive generation. -# For production use, consider using WhisperKit or Apple's pre-converted models. diff --git a/creative_apps/LivePortraitDemo/LivePortraitDemo.xcodeproj/project.pbxproj b/creative_apps/LivePortraitDemo/LivePortraitDemo.xcodeproj/project.pbxproj index 60bc77e..f717bc9 100644 --- a/creative_apps/LivePortraitDemo/LivePortraitDemo.xcodeproj/project.pbxproj +++ b/creative_apps/LivePortraitDemo/LivePortraitDemo.xcodeproj/project.pbxproj @@ -7,20 +7,20 @@ objects = { /* Begin PBXBuildFile section */ - C10001 /* LivePortraitDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = C10002; }; - C10003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = C10004; }; - C10005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = C10006; }; - C1LP02 /* LivePortrait_MotionExtractor.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C1LP01; }; - C1LP04 /* LivePortrait_AppearanceExtractor.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C1LP03; }; - C1LP06 /* LivePortrait_WarpingNetwork.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C1LP05; }; - C1LP08 /* LivePortrait_SPADEGenerator.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C1LP07; }; + C10001 /* LivePortraitDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = C10002 /* LivePortraitDemoApp.swift */; }; + C10003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = C10004 /* ContentView.swift */; }; + C10005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = C10006 /* Assets.xcassets */; }; + C1LP02 /* LivePortrait_MotionExtractor.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C1LP01 /* LivePortrait_MotionExtractor.mlpackage */; }; + C1LP04 /* LivePortrait_AppearanceExtractor.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C1LP03 /* LivePortrait_AppearanceExtractor.mlpackage */; }; + C1LP06 /* LivePortrait_WarpingNetwork.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C1LP05 /* LivePortrait_WarpingNetwork.mlpackage */; }; + C1LP08 /* LivePortrait_SPADEGenerator.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = C1LP07 /* LivePortrait_SPADEGenerator.mlpackage */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ - C10007 /* LivePortraitDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LivePortraitDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; C10002 /* LivePortraitDemoApp.swift */ = {isa = PBXFileReference; 
lastKnownFileType = sourcecode.swift; path = LivePortraitDemoApp.swift; sourceTree = ""; }; C10004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; C10006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + C10007 /* LivePortraitDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LivePortraitDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; C10008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; C1LP01 /* LivePortrait_MotionExtractor.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = LivePortrait_MotionExtractor.mlpackage; sourceTree = ""; }; C1LP03 /* LivePortrait_AppearanceExtractor.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = LivePortrait_AppearanceExtractor.mlpackage; sourceTree = ""; }; @@ -75,7 +75,7 @@ /* Begin PBXNativeTarget section */ C10013 /* LivePortraitDemo */ = { isa = PBXNativeTarget; - buildConfigurationList = C10014; + buildConfigurationList = C10014 /* Build configuration list for PBXNativeTarget "LivePortraitDemo" */; buildPhases = ( C10015 /* Sources */, C10009 /* Frameworks */, @@ -87,7 +87,7 @@ ); name = LivePortraitDemo; productName = LivePortraitDemo; - productReference = C10007; + productReference = C10007 /* LivePortraitDemo.app */; productType = "com.apple.product-type.application"; }; /* End PBXNativeTarget section */ @@ -105,7 +105,7 @@ }; }; }; - buildConfigurationList = C10018; + buildConfigurationList = C10018 /* Build configuration list for PBXProject "LivePortraitDemo" */; compatibilityVersion = "Xcode 14.0"; developmentRegion = en; hasScannedForEncodings = 0; @@ -114,11 +114,11 @@ Base, ); mainGroup = C10010; - productRefGroup = C10012; + productRefGroup = C10012 /* Products */; projectDirPath = ""; projectRoot = ""; targets = ( - C10013, + C10013 /* LivePortraitDemo */, ); }; /* End PBXProject section */ @@ -211,6 +211,7 @@ ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = MFN25KNUGJ; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = LivePortraitDemo/Info.plist; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; @@ -238,6 +239,7 @@ ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = MFN25KNUGJ; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = LivePortraitDemo/Info.plist; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; @@ -261,26 +263,25 @@ /* End XCBuildConfiguration section */ /* Begin XCConfigurationList section */ - C10018 /* Build configuration list for PBXProject */ = { + C10014 /* Build configuration list for PBXNativeTarget "LivePortraitDemo" */ = { isa = XCConfigurationList; buildConfigurations = ( - C10019, - C10020, + C10021 /* Debug */, + C10022 /* Release */, ); defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; - C10014 /* Build configuration list for PBXNativeTarget */ = { + C10018 /* Build configuration list for PBXProject "LivePortraitDemo" */ = { isa = XCConfigurationList; buildConfigurations = ( - C10021, - C10022, + C10019 /* Debug */, + C10020 /* Release */, ); defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; /* End XCConfigurationList 
section */ - }; - rootObject = C10017; + rootObject = C10017 /* Project object */; } diff --git a/creative_apps/LivePortraitDemo/LivePortraitDemo/ContentView.swift b/creative_apps/LivePortraitDemo/LivePortraitDemo/ContentView.swift index d2a9a6b..3101001 100644 --- a/creative_apps/LivePortraitDemo/LivePortraitDemo/ContentView.swift +++ b/creative_apps/LivePortraitDemo/LivePortraitDemo/ContentView.swift @@ -1,22 +1,11 @@ import SwiftUI import UIKit import CoreML -import Vision import PhotosUI -import AVKit +import AVFoundation +import Accelerate -// MARK: - LivePortrait: Portrait Animation via Multi-Model Pipeline -// -// Pipeline stages: -// 1. MotionExtractor - Extracts 3D motion parameters (pitch, yaw, roll, expression, translation) -// from each driving video frame -// 2. AppearanceExtractor - Extracts appearance features from the source portrait -// 3. WarpingNetwork - Warps source appearance using motion deltas between source and driving -// 4. SPADEGenerator - Generates the final animated frame from warped features -// -// Each model is loaded independently and run in sequence for each frame. - -// MARK: - Pipeline Stage Model +// MARK: - Pipeline Stage enum PipelineStage: String, CaseIterable, Identifiable { case motionExtractor = "Motion Extractor" @@ -35,19 +24,6 @@ enum PipelineStage: String, CaseIterable, Identifiable { } } - var description: String { - switch self { - case .motionExtractor: - return "Extracts 3D motion parameters (rotation, expression, translation) from face images" - case .appearanceExtractor: - return "Extracts identity-preserving appearance features from the source portrait" - case .warpingNetwork: - return "Warps source appearance features according to driving motion parameters" - case .spadeGenerator: - return "Generates the final animated frame using SPADE normalization" - } - } - var icon: String { switch self { case .motionExtractor: return "arrow.triangle.branch" @@ -59,11 +35,7 @@ enum PipelineStage: String, CaseIterable, Identifiable { } enum StageStatus: Equatable { - case pending - case running - case completed - case failed(String) - + case pending, running, completed, failed(String) var color: Color { switch self { case .pending: return .gray @@ -74,6 +46,196 @@ enum StageStatus: Equatable { } } +// MARK: - Motion Parameters + +struct MotionInfo { + var kp: [Float] // [63] canonical keypoints + var exp: [Float] // [63] expression + var scale: Float + var t: [Float] // [3] translation + var pitchBins: [Float] // [66] + var yawBins: [Float] // [66] + var rollBins: [Float] // [66] + + var pitch: Float { headposePredToDegree(pitchBins) } + var yaw: Float { headposePredToDegree(yawBins) } + var roll: Float { headposePredToDegree(rollBins) } + var rotMat: [[Float]] { getRotationMatrix(pitch: pitch, yaw: yaw, roll: roll) } +} + +// MARK: - Math Helpers + +func headposePredToDegree(_ pred: [Float]) -> Float { + let maxVal = pred.max() ?? 
0 + let exps = pred.map { exp($0 - maxVal) } + let sum = exps.reduce(0, +) + let probs = exps.map { $0 / sum } + var degree: Float = 0 + for i in 0..<66 { degree += probs[i] * Float(i) } + return degree * 3.0 - 97.5 +} + +func getRotationMatrix(pitch: Float, yaw: Float, roll: Float) -> [[Float]] { + let p = pitch * .pi / 180, y = yaw * .pi / 180, r = roll * .pi / 180 + let rx: [[Float]] = [[1,0,0],[0,cos(p),-sin(p)],[0,sin(p),cos(p)]] + let ry: [[Float]] = [[cos(y),0,sin(y)],[0,1,0],[-sin(y),0,cos(y)]] + let rz: [[Float]] = [[cos(r),-sin(r),0],[sin(r),cos(r),0],[0,0,1]] + let zy = matmul3x3(rz, ry) + let zyx = matmul3x3(zy, rx) + return transpose3x3(zyx) +} + +func matmul3x3(_ a: [[Float]], _ b: [[Float]]) -> [[Float]] { + var c = [[Float]](repeating: [Float](repeating: 0, count: 3), count: 3) + for i in 0..<3 { for j in 0..<3 { for k in 0..<3 { c[i][j] += a[i][k] * b[k][j] } } } + return c +} + +func transpose3x3(_ m: [[Float]]) -> [[Float]] { + var r = [[Float]](repeating: [Float](repeating: 0, count: 3), count: 3) + for i in 0..<3 { for j in 0..<3 { r[i][j] = m[j][i] } } + return r +} + +/// kp_transformed = scale * (kp @ R + exp) + t (t.z = 0) +func transformKeypoint(kp: [Float], exp: [Float], scale: Float, t: [Float], rotMat: [[Float]]) -> [Float] { + var result = [Float](repeating: 0, count: 63) + for i in 0..<21 { + var rotated = [Float](repeating: 0, count: 3) + for j in 0..<3 { + for k in 0..<3 { rotated[j] += kp[i*3+k] * rotMat[k][j] } + } + for j in 0..<3 { + result[i*3+j] = scale * (rotated[j] + exp[i*3+j]) + } + result[i*3+0] += t[0] + result[i*3+1] += t[1] + } + return result +} + +// MARK: - Image / MultiArray Helpers + +func imageToMultiArray(_ image: UIImage, size: Int) -> MLMultiArray? { + guard let resized = image.resized(to: CGSize(width: size, height: size)), + let cgImage = resized.cgImage else { return nil } + + let bpr = size * 4 + guard let ctx = CGContext(data: nil, width: size, height: size, bitsPerComponent: 8, + bytesPerRow: bpr, space: CGColorSpaceCreateDeviceRGB(), + bitmapInfo: CGImageAlphaInfo.noneSkipLast.rawValue) else { return nil } + ctx.draw(cgImage, in: CGRect(x: 0, y: 0, width: size, height: size)) + guard let data = ctx.data else { return nil } + let ptr = data.assumingMemoryBound(to: UInt8.self) + + guard let array = try? 
MLMultiArray(shape: [1, 3, NSNumber(value: size), NSNumber(value: size)], dataType: .float16) else { return nil }
+    let count = size * size
+    let f16 = array.dataPointer.bindMemory(to: UInt16.self, capacity: 3 * count)
+
+    var rBuf = [Float](repeating: 0, count: count)
+    var gBuf = [Float](repeating: 0, count: count)
+    var bBuf = [Float](repeating: 0, count: count)
+    let scale: Float = 1.0 / 255.0
+    for y in 0..<size {
+        for x in 0..<size {
+            let srcIdx = (y * size + x) * 4
+            let dstIdx = y * size + x
+            rBuf[dstIdx] = Float(ptr[srcIdx]) * scale
+            gBuf[dstIdx] = Float(ptr[srcIdx + 1]) * scale
+            bBuf[dstIdx] = Float(ptr[srcIdx + 2]) * scale
+        }
+    }
+    convertF32toF16(rBuf, to: f16, count: count)
+    convertF32toF16(gBuf, to: f16 + count, count: count)
+    convertF32toF16(bBuf, to: f16 + 2 * count, count: count)
+    return array
+}
+
+func convertF32toF16(_ src: [Float], to dst: UnsafeMutablePointer<UInt16>, count: Int) {
+    src.withUnsafeBufferPointer { srcBuf in
+        var srcV = vImage_Buffer(data: UnsafeMutableRawPointer(mutating: srcBuf.baseAddress!),
+                                 height: 1, width: vImagePixelCount(count), rowBytes: count * 4)
+        var dstV = vImage_Buffer(data: UnsafeMutableRawPointer(dst),
+                                 height: 1, width: vImagePixelCount(count), rowBytes: count * 2)
+        vImageConvert_PlanarFtoPlanar16F(&srcV, &dstV, 0)
+    }
+}
+
+func multiArrayToFloat(_ array: MLMultiArray, count: Int) -> [Float] {
+    let fp16 = array.dataPointer.bindMemory(to: UInt16.self, capacity: count)
+    var result = [Float](repeating: 0, count: count)
+    result.withUnsafeMutableBufferPointer { dstBuf in
+        var srcV = vImage_Buffer(data: UnsafeMutableRawPointer(mutating: fp16),
+                                 height: 1, width: vImagePixelCount(count), rowBytes: count * 2)
+        var dstV = vImage_Buffer(data: dstBuf.baseAddress!,
+                                 height: 1, width: vImagePixelCount(count), rowBytes: count * 4)
+        vImageConvert_Planar16FtoPlanarF(&srcV, &dstV, 0)
+    }
+    return result
+}
+
+func generatedImageToUIImage(_ array: MLMultiArray) -> UIImage? {
+    // [1, 3, 512, 512] Float16 → UIImage
+    let size = 512
+    let count = size * size
+    let floats = multiArrayToFloat(array, count: 3 * count)
+
+    var pixels = [UInt8](repeating: 255, count: count * 4)
+    for i in 0..<count {
+        pixels[i * 4]     = UInt8(max(0, min(255, floats[i] * 255)))
+        pixels[i * 4 + 1] = UInt8(max(0, min(255, floats[count + i] * 255)))
+        pixels[i * 4 + 2] = UInt8(max(0, min(255, floats[2 * count + i] * 255)))
+    }
+
+    return pixels.withUnsafeMutableBytes { buf -> UIImage? in
+        guard let ctx = CGContext(data: buf.baseAddress, width: size, height: size,
+                                  bitsPerComponent: 8, bytesPerRow: size * 4,
+                                  space: CGColorSpaceCreateDeviceRGB(),
+                                  bitmapInfo: CGImageAlphaInfo.noneSkipLast.rawValue),
+              let cgImage = ctx.makeImage() else { return nil }
+        return UIImage(cgImage: cgImage)
+    }
+}
+
+func flatToMLMultiArray(shape: [NSNumber], values: [Float], dataType: MLMultiArrayDataType = .float16) -> MLMultiArray? {
+    guard let array = try? MLMultiArray(shape: shape, dataType: dataType) else { return nil }
+    let count = values.count
+    let dst = array.dataPointer.bindMemory(to: UInt16.self, capacity: count)
+    convertF32toF16(values, to: dst, count: count)
+    return array
+}
+
+// MARK: - Video Frame Extraction
+
+func extractFrames(from url: URL, maxFrames: Int = 30) async -> [UIImage] {
+    let asset = AVURLAsset(url: url)
+    guard let track = try? await asset.loadTracks(withMediaType: .video).first,
+          let duration = try? await asset.load(.duration) else { return [] }
+
+    let fps = (try? await track.load(.nominalFrameRate)) ?? 30
+    let totalSeconds = CMTimeGetSeconds(duration)
+    let totalFrameCount = Int(totalSeconds * Double(fps))
+    let step = max(1, totalFrameCount / maxFrames)
+    let frameCount = min(maxFrames, totalFrameCount)
+
+    let generator = AVAssetImageGenerator(asset: asset)
+    generator.appliesPreferredTrackTransform = true
+    generator.requestedTimeToleranceBefore = .zero
+    generator.requestedTimeToleranceAfter = .zero
+
+    var frames: [UIImage] = []
+    for i in 0..<frameCount {
+        let time = CMTime(seconds: Double(i * step) / Double(fps), preferredTimescale: 600)
+        if let cg = try? generator.copyCGImage(at: time, actualTime: nil) {
+            frames.append(UIImage(cgImage: cg))
+        }
+    }
+    return frames
+}
 
     private func headerView(title: String) -> 
some View { - HStack { - Text(title) - .font(.headline) - Spacer() - } + HStack { Text(title).font(.headline); Spacer() } } private func placeholderView(title: String, systemImage: String) -> some View { VStack(spacing: 12) { - Image(systemName: systemImage) - .font(.system(size: 40)) - .foregroundColor(.secondary) - Text(title) - .foregroundColor(.secondary) + Image(systemName: systemImage).font(.system(size: 40)).foregroundColor(.secondary) + Text(title).foregroundColor(.secondary) } - .frame(maxWidth: .infinity) - .frame(height: 160) - .background(Color(.systemGray6)) - .cornerRadius(12) + .frame(maxWidth: .infinity).frame(height: 160) + .background(Color(.systemGray6)).cornerRadius(12) } } @@ -239,53 +361,21 @@ struct ContentView: View { struct PipelineStageRow: View { let stage: PipelineStage let status: StageStatus - var body: some View { HStack(spacing: 12) { - // Status indicator ZStack { - Circle() - .fill(status.color.opacity(0.2)) - .frame(width: 36, height: 36) - if case .running = status { - ProgressView() - .scaleEffect(0.7) - } else { - Image(systemName: statusIcon) - .font(.caption) - .foregroundColor(status.color) - } + Circle().fill(status.color.opacity(0.2)).frame(width: 32, height: 32) + if case .running = status { ProgressView().scaleEffect(0.6) } + else { Image(systemName: statusIcon).font(.caption2).foregroundColor(status.color) } } - VStack(alignment: .leading, spacing: 2) { - HStack { - Image(systemName: stage.icon) - .font(.caption) - Text(stage.rawValue) - .font(.subheadline) - .fontWeight(.medium) - } - Text(stage.description) - .font(.caption2) - .foregroundColor(.secondary) - .lineLimit(2) - - if case .failed(let msg) = status { - Text(msg) - .font(.caption2) - .foregroundColor(.red) - } + HStack { Image(systemName: stage.icon).font(.caption2); Text(stage.rawValue).font(.caption).fontWeight(.medium) } + if case .failed(let msg) = status { Text(msg).font(.caption2).foregroundColor(.red) } } - Spacer() } - .padding(10) - .background( - RoundedRectangle(cornerRadius: 10) - .fill(Color(.systemGray6)) - ) + .padding(8).background(RoundedRectangle(cornerRadius: 8).fill(Color(.systemGray6))) } - private var statusIcon: String { switch status { case .pending: return "circle" @@ -299,24 +389,36 @@ struct PipelineStageRow: View { // MARK: - ViewModel class LivePortraitViewModel: ObservableObject { - @Published var selectedSourcePhoto: PhotosPickerItem? { - didSet { loadSourceImage() } - } - @Published var selectedDrivingVideo: PhotosPickerItem? { - didSet { loadDrivingVideo() } - } + @Published var selectedSourcePhoto: PhotosPickerItem? { didSet { loadSourceImage() } } + @Published var selectedDrivingVideo: PhotosPickerItem? { didSet { loadDrivingVideo() } } @Published var sourceImage: UIImage? @Published var drivingVideoURL: URL? @Published var drivingThumbnail: UIImage? - @Published var resultImage: UIImage? + @Published var resultFrames: [UIImage] = [] + @Published var currentFrameIndex: Int = 0 + @Published var isPlaying = false @Published var isProcessing = false @Published var errorMessage: String? + @Published var statusMessage = "" @Published var stageStatuses: [PipelineStage: StageStatus] = [:] - init() { - // Initialize all stages as pending - for stage in PipelineStage.allCases { - stageStatuses[stage] = .pending + private var playbackTimer: Timer? 
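+    // playbackTimer pages currentFrameIndex through resultFrames at ~15 fps;
+    // togglePlayback() starts it on play and invalidates it on stop.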
+ + init() { resetStages() } + + func togglePlayback() { + if isPlaying { + playbackTimer?.invalidate() + playbackTimer = nil + isPlaying = false + } else { + isPlaying = true + playbackTimer = Timer.scheduledTimer(withTimeInterval: 1.0/15.0, repeats: true) { [weak self] _ in + guard let self else { return } + DispatchQueue.main.async { + self.currentFrameIndex = (self.currentFrameIndex + 1) % max(1, self.resultFrames.count) + } + } } } @@ -325,11 +427,7 @@ class LivePortraitViewModel: ObservableObject { Task { if let data = try? await item.loadTransferable(type: Data.self), let image = UIImage(data: data) { - await MainActor.run { - self.sourceImage = image - self.resultImage = nil - self.resetStages() - } + await MainActor.run { self.sourceImage = image; self.resultFrames = []; self.resetStages() } } } } @@ -337,168 +435,224 @@ class LivePortraitViewModel: ObservableObject { private func loadDrivingVideo() { guard let item = selectedDrivingVideo else { return } Task { - // Load video as a Movie transferable if let videoData = try? await item.loadTransferable(type: Data.self) { let tempURL = FileManager.default.temporaryDirectory - .appendingPathComponent(UUID().uuidString) - .appendingPathExtension("mov") + .appendingPathComponent(UUID().uuidString).appendingPathExtension("mov") try? videoData.write(to: tempURL) - - // Generate thumbnail let asset = AVURLAsset(url: tempURL) - let generator = AVAssetImageGenerator(asset: asset) - generator.appliesPreferredTrackTransform = true - let cgImage = try? generator.copyCGImage(at: .zero, actualTime: nil) - + let gen = AVAssetImageGenerator(asset: asset) + gen.appliesPreferredTrackTransform = true + let cg = try? gen.copyCGImage(at: .zero, actualTime: nil) await MainActor.run { self.drivingVideoURL = tempURL - self.drivingThumbnail = cgImage.map { UIImage(cgImage: $0) } - self.resultImage = nil - self.resetStages() + self.drivingThumbnail = cg.map { UIImage(cgImage: $0) } + self.resultFrames = []; self.resetStages() } } } } private func resetStages() { - for stage in PipelineStage.allCases { - stageStatuses[stage] = .pending - } + for stage in PipelineStage.allCases { stageStatuses[stage] = .pending } } func runPipeline() { guard sourceImage != nil, drivingVideoURL != nil else { return } - isProcessing = true - errorMessage = nil - resetStages() - + isProcessing = true; errorMessage = nil; resultFrames = []; resetStages() Task { do { try await executePipeline() - await MainActor.run { - self.isProcessing = false - } + await MainActor.run { self.isProcessing = false; self.statusMessage = "Done" } } catch { - await MainActor.run { - self.errorMessage = error.localizedDescription - self.isProcessing = false - } + await MainActor.run { self.errorMessage = error.localizedDescription; self.isProcessing = false } } } } + // MARK: - Full Pipeline + private func executePipeline() async throws { + guard let sourceImage, let drivingVideoURL else { return } + let config = MLModelConfiguration() - config.computeUnits = .cpuAndNeuralEngine + config.computeUnits = .cpuAndGPU - // Stage 1: Motion Extractor - await setStageStatus(.motionExtractor, .running) - do { - guard let modelURL = Bundle.main.url( - forResource: PipelineStage.motionExtractor.modelFileName, - withExtension: "mlmodelc" - ) else { - throw LivePortraitError.modelNotFound( - "\(PipelineStage.motionExtractor.modelFileName).mlmodelc not found. " + - "Add the compiled model to the project." 
- ) + // Load all models + func loadModel(_ stage: PipelineStage) throws -> MLModel { + guard let url = Bundle.main.url(forResource: stage.modelFileName, withExtension: "mlmodelc") else { + throw LivePortraitError.modelNotFound("\(stage.modelFileName) not found") } - let _ = try MLModel(contentsOf: modelURL, configuration: config) - - // In production: extract motion params from each driving video frame - // Output: pitch, yaw, roll, expression coefficients, translation vectors - await setStageStatus(.motionExtractor, .completed) - } catch { - await setStageStatus(.motionExtractor, .failed(error.localizedDescription)) - throw error + return try MLModel(contentsOf: url, configuration: config) + } + + let motionModel = try loadModel(.motionExtractor) + let appearanceModel = try loadModel(.appearanceExtractor) + let warpingModel = try loadModel(.warpingNetwork) + let spadeModel = try loadModel(.spadeGenerator) + + // Prepare source image (256x256) + guard let srcArray = imageToMultiArray(sourceImage, size: 256) else { + throw LivePortraitError.processingFailed("Failed to preprocess source image") } - // Stage 2: Appearance Extractor + // Stage 1: Extract source motion + await setStageStatus(.motionExtractor, .running) + await setStatus("Extracting source motion...") + + let srcMotionInput = try MLDictionaryFeatureProvider(dictionary: ["image": MLFeatureValue(multiArray: srcArray)]) + let srcMotionOut = try motionModel.prediction(from: srcMotionInput) + let srcMotion = extractMotionInfo(srcMotionOut) + let srcR = srcMotion.rotMat + let kpSource = transformKeypoint(kp: srcMotion.kp, exp: srcMotion.exp, scale: srcMotion.scale, + t: srcMotion.t, rotMat: srcR) + + // Extract driving video frames + await setStatus("Extracting video frames...") + let drivingFrames = await extractFrames(from: drivingVideoURL, maxFrames: 30) + guard !drivingFrames.isEmpty else { + throw LivePortraitError.processingFailed("Could not extract frames from driving video") + } + + // Extract motion from first driving frame (reference) + guard let drv0Array = imageToMultiArray(drivingFrames[0], size: 256) else { + throw LivePortraitError.processingFailed("Failed to preprocess driving frame 0") + } + let drv0Input = try MLDictionaryFeatureProvider(dictionary: ["image": MLFeatureValue(multiArray: drv0Array)]) + let drv0Out = try motionModel.prediction(from: drv0Input) + let drv0Motion = extractMotionInfo(drv0Out) + let drv0R = drv0Motion.rotMat + + await setStageStatus(.motionExtractor, .completed) + + // Stage 2: Extract source appearance (once) await setStageStatus(.appearanceExtractor, .running) - do { - guard let modelURL = Bundle.main.url( - forResource: PipelineStage.appearanceExtractor.modelFileName, - withExtension: "mlmodelc" - ) else { - throw LivePortraitError.modelNotFound( - "\(PipelineStage.appearanceExtractor.modelFileName).mlmodelc not found. " + - "Add the compiled model to the project." 
- ) - } - let _ = try MLModel(contentsOf: modelURL, configuration: config) - - // In production: extract appearance feature volume from source portrait - // This is done once and reused for all frames - await setStageStatus(.appearanceExtractor, .completed) - } catch { - await setStageStatus(.appearanceExtractor, .failed(error.localizedDescription)) - throw error + await setStatus("Extracting appearance features...") + + let appInput = try MLDictionaryFeatureProvider(dictionary: ["source_image": MLFeatureValue(multiArray: srcArray)]) + let appOut = try appearanceModel.prediction(from: appInput) + guard let feature3d = appOut.featureValue(for: "feature_3d")?.multiArrayValue else { + throw LivePortraitError.processingFailed("Failed to extract feature_3d") } - // Stage 3: Warping Network - await setStageStatus(.warpingNetwork, .running) - do { - guard let modelURL = Bundle.main.url( - forResource: PipelineStage.warpingNetwork.modelFileName, - withExtension: "mlmodelc" - ) else { - throw LivePortraitError.modelNotFound( - "\(PipelineStage.warpingNetwork.modelFileName).mlmodelc not found. " + - "Add the compiled model to the project." - ) - } - let _ = try MLModel(contentsOf: modelURL, configuration: config) - - // In production: warp source appearance features using - // the delta between source and driving motion parameters - await setStageStatus(.warpingNetwork, .completed) - } catch { - await setStageStatus(.warpingNetwork, .failed(error.localizedDescription)) - throw error + await setStageStatus(.appearanceExtractor, .completed) + + // Prepare kp_source as MLMultiArray [1, 21, 3] + guard let kpSourceArray = flatToMLMultiArray(shape: [1, 21, 3], values: kpSource) else { + throw LivePortraitError.processingFailed("Failed to create kp_source array") } - // Stage 4: SPADE Generator + // Stage 3 & 4: Process each driving frame + await setStageStatus(.warpingNetwork, .running) await setStageStatus(.spadeGenerator, .running) - do { - guard let modelURL = Bundle.main.url( - forResource: PipelineStage.spadeGenerator.modelFileName, - withExtension: "mlmodelc" - ) else { - throw LivePortraitError.modelNotFound( - "\(PipelineStage.spadeGenerator.modelFileName).mlmodelc not found. " + - "Add the compiled model to the project." 
- ) - } - let _ = try MLModel(contentsOf: modelURL, configuration: config) - // In production: generate final animated frame from warped features - // using SPADE (Spatially-Adaptive Normalization) decoder + var outputFrames: [UIImage] = [] + + for (i, frame) in drivingFrames.enumerated() { + await setStatus("Frame \(i+1)/\(drivingFrames.count)...") + + // Extract driving motion + guard let drvArray = imageToMultiArray(frame, size: 256) else { continue } + let drvInput = try MLDictionaryFeatureProvider(dictionary: ["image": MLFeatureValue(multiArray: drvArray)]) + let drvOut = try motionModel.prediction(from: drvInput) + let drvMotion = extractMotionInfo(drvOut) + let drvR = drvMotion.rotMat + + // Relative motion: R_new = (R_drv_i @ R_drv_0^T) @ R_src + let drv0RT = transpose3x3(drv0R) + let deltaR = matmul3x3(drvR, drv0RT) + let rNew = matmul3x3(deltaR, srcR) + + // Relative expression, scale, translation + var expNew = [Float](repeating: 0, count: 63) + for j in 0..<63 { expNew[j] = srcMotion.exp[j] + (drvMotion.exp[j] - drv0Motion.exp[j]) } + let scaleNew = srcMotion.scale * (drvMotion.scale / drv0Motion.scale) + var tNew = [Float](repeating: 0, count: 3) + tNew[0] = srcMotion.t[0] + (drvMotion.t[0] - drv0Motion.t[0]) + tNew[1] = srcMotion.t[1] + (drvMotion.t[1] - drv0Motion.t[1]) + tNew[2] = 0 + + let kpDriving = transformKeypoint(kp: srcMotion.kp, exp: expNew, scale: scaleNew, + t: tNew, rotMat: rNew) + + guard let kpDrivingArray = flatToMLMultiArray(shape: [1, 21, 3], values: kpDriving) else { continue } + + // Warping + let warpInput = try MLDictionaryFeatureProvider(dictionary: [ + "feature_3d": MLFeatureValue(multiArray: feature3d), + "kp_driving": MLFeatureValue(multiArray: kpDrivingArray), + "kp_source": MLFeatureValue(multiArray: kpSourceArray) + ]) + let warpOut = try warpingModel.prediction(from: warpInput) + guard let warpedFeature = warpOut.featureValue(for: "warped_feature")?.multiArrayValue else { continue } + + // SPADE Generator + let spadeInput = try MLDictionaryFeatureProvider(dictionary: [ + "warped_feature": MLFeatureValue(multiArray: warpedFeature) + ]) + let spadeOut = try spadeModel.prediction(from: spadeInput) + guard let genImage = spadeOut.featureValue(for: "generated_image")?.multiArrayValue else { continue } + + if let uiImage = generatedImageToUIImage(genImage) { + outputFrames.append(uiImage) + } - // For demo, use the source image as placeholder result - await MainActor.run { - self.resultImage = self.sourceImage + // Update UI periodically + if i % 3 == 0 || i == drivingFrames.count - 1 { + let frames = outputFrames + await MainActor.run { self.resultFrames = frames } } - await setStageStatus(.spadeGenerator, .completed) - } catch { - await setStageStatus(.spadeGenerator, .failed(error.localizedDescription)) - throw error + } + + await setStageStatus(.warpingNetwork, .completed) + await setStageStatus(.spadeGenerator, .completed) + + let finalFrames = outputFrames + await MainActor.run { + self.resultFrames = finalFrames + self.currentFrameIndex = 0 } } + private func extractMotionInfo(_ output: MLFeatureProvider) -> MotionInfo { + let pitchArr = output.featureValue(for: "pitch")!.multiArrayValue! + let yawArr = output.featureValue(for: "yaw")!.multiArrayValue! + let rollArr = output.featureValue(for: "roll")!.multiArrayValue! + let tArr = output.featureValue(for: "t")!.multiArrayValue! + let expArr = output.featureValue(for: "exp")!.multiArrayValue! + let scaleArr = output.featureValue(for: "scale")!.multiArrayValue! 
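+        // pitch/yaw/roll arrive as 66-way softmax bins rather than angles;
+        // MotionInfo converts them with headposePredToDegree (expectation over
+        // bin indices, mapped into roughly [-97.5, 97.5] degrees).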
+ let kpArr = output.featureValue(for: "kp")!.multiArrayValue! + + return MotionInfo( + kp: multiArrayToFloat(kpArr, count: 63), + exp: multiArrayToFloat(expArr, count: 63), + scale: multiArrayToFloat(scaleArr, count: 1)[0], + t: multiArrayToFloat(tArr, count: 3), + pitchBins: multiArrayToFloat(pitchArr, count: 66), + yawBins: multiArrayToFloat(yawArr, count: 66), + rollBins: multiArrayToFloat(rollArr, count: 66) + ) + } + @MainActor private func setStageStatus(_ stage: PipelineStage, _ status: StageStatus) { stageStatuses[stage] = status } + + @MainActor + private func setStatus(_ msg: String) { + statusMessage = msg + } } +// MARK: - Errors + enum LivePortraitError: LocalizedError { case modelNotFound(String) case processingFailed(String) - var errorDescription: String? { switch self { - case .modelNotFound(let msg): return msg - case .processingFailed(let msg): return msg + case .modelNotFound(let m): return m + case .processingFailed(let m): return m } } } @@ -508,12 +662,8 @@ enum LivePortraitError: LocalizedError { extension UIImage { func resized(to targetSize: CGSize) -> UIImage? { let renderer = UIGraphicsImageRenderer(size: targetSize) - return renderer.image { _ in - self.draw(in: CGRect(origin: .zero, size: targetSize)) - } + return renderer.image { _ in self.draw(in: CGRect(origin: .zero, size: targetSize)) } } } -#Preview { - ContentView() -} +#Preview { ContentView() } diff --git a/creative_apps/WhisperDemo/WhisperDemo.xcodeproj/project.pbxproj b/creative_apps/WhisperDemo/WhisperDemo.xcodeproj/project.pbxproj deleted file mode 100644 index b0f321f..0000000 --- a/creative_apps/WhisperDemo/WhisperDemo.xcodeproj/project.pbxproj +++ /dev/null @@ -1,274 +0,0 @@ -// !$*UTF8*$! -{ - archiveVersion = 1; - classes = { - }; - objectVersion = 56; - objects = { - -/* Begin PBXBuildFile section */ - WH0001 /* WhisperDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = WH0002; }; - WH0003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = WH0004; }; - WH0005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = WH0006; }; - WHML02 /* WhisperTinyEncoder.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = WHML01 /* WhisperTinyEncoder.mlpackage */; }; -/* End PBXBuildFile section */ - -/* Begin PBXFileReference section */ - WH0007 /* WhisperDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = WhisperDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; - WH0002 /* WhisperDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WhisperDemoApp.swift; sourceTree = ""; }; - WH0004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; - WH0006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; - WH0008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; - WHML01 /* WhisperTinyEncoder.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = WhisperTinyEncoder.mlpackage; sourceTree = ""; }; -/* End PBXFileReference section */ - -/* Begin PBXFrameworksBuildPhase section */ - WH0009 /* Frameworks */ = { - isa = PBXFrameworksBuildPhase; - buildActionMask = 2147483647; - files = ( - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXFrameworksBuildPhase section */ - -/* Begin PBXGroup section */ - WH0010 = { - isa = 
PBXGroup; - children = ( - WH0011 /* WhisperDemo */, - WH0012 /* Products */, - ); - sourceTree = ""; - }; - WH0011 /* WhisperDemo */ = { - isa = PBXGroup; - children = ( - WH0002 /* WhisperDemoApp.swift */, - WH0004 /* ContentView.swift */, - WH0006 /* Assets.xcassets */, - WH0008 /* Info.plist */, - WHML01 /* WhisperTinyEncoder.mlpackage */, - ); - path = WhisperDemo; - sourceTree = ""; - }; - WH0012 /* Products */ = { - isa = PBXGroup; - children = ( - WH0007 /* WhisperDemo.app */, - ); - name = Products; - sourceTree = ""; - }; -/* End PBXGroup section */ - -/* Begin PBXNativeTarget section */ - WH0013 /* WhisperDemo */ = { - isa = PBXNativeTarget; - buildConfigurationList = WH0014; - buildPhases = ( - WH0015 /* Sources */, - WH0009 /* Frameworks */, - WH0016 /* Resources */, - ); - buildRules = ( - ); - dependencies = ( - ); - name = WhisperDemo; - productName = WhisperDemo; - productReference = WH0007; - productType = "com.apple.product-type.application"; - }; -/* End PBXNativeTarget section */ - -/* Begin PBXProject section */ - WH0017 /* Project object */ = { - isa = PBXProject; - attributes = { - BuildIndependentTargetsInParallel = 1; - LastSwiftUpdateCheck = 1500; - LastUpgradeCheck = 1500; - TargetAttributes = { - WH0013 = { - CreatedOnToolsVersion = 15.0; - }; - }; - }; - buildConfigurationList = WH0018; - compatibilityVersion = "Xcode 14.0"; - developmentRegion = en; - hasScannedForEncodings = 0; - knownRegions = ( - en, - Base, - ); - mainGroup = WH0010; - productRefGroup = WH0012; - projectDirPath = ""; - projectRoot = ""; - targets = ( - WH0013, - ); - }; -/* End PBXProject section */ - -/* Begin PBXResourcesBuildPhase section */ - WH0016 /* Resources */ = { - isa = PBXResourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - WH0005 /* Assets.xcassets in Resources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXResourcesBuildPhase section */ - -/* Begin PBXSourcesBuildPhase section */ - WH0015 /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - WH0001 /* WhisperDemoApp.swift in Sources */, - WH0003 /* ContentView.swift in Sources */, - WHML02 /* WhisperTinyEncoder.mlpackage in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXSourcesBuildPhase section */ - -/* Begin XCBuildConfiguration section */ - WH0019 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; - CLANG_ANALYZER_NONNULL = YES; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = dwarf; - ENABLE_STRICT_OBJC_MSGSEND = YES; - ENABLE_TESTABILITY = YES; - GCC_DYNAMIC_NO_PIC = NO; - GCC_OPTIMIZATION_LEVEL = 0; - GCC_PREPROCESSOR_DEFINITIONS = ( - "DEBUG=1", - "$(inherited)", - ); - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; - MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; - ONLY_ACTIVE_ARCH = YES; - SDKROOT = iphoneos; - SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; - SWIFT_OPTIMIZATION_LEVEL = "-Onone"; - }; - name = Debug; - }; - WH0020 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; - CLANG_ANALYZER_NONNULL = YES; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC 
= YES; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; - ENABLE_NS_ASSERTIONS = NO; - ENABLE_STRICT_OBJC_MSGSEND = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; - SDKROOT = iphoneos; - SWIFT_COMPILATION_MODE = wholemodule; - SWIFT_OPTIMIZATION_LEVEL = "-O"; - VALIDATE_PRODUCT = YES; - }; - name = Release; - }; - WH0021 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CODE_SIGN_STYLE = Automatic; - CURRENT_PROJECT_VERSION = 1; - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_FILE = WhisperDemo/Info.plist; - INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; - INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; - INFOPLIST_KEY_UILaunchScreen_Generation = YES; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.whisperdemo"; - PRODUCT_NAME = "$(TARGET_NAME)"; - SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Debug; - }; - WH0022 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CODE_SIGN_STYLE = Automatic; - CURRENT_PROJECT_VERSION = 1; - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_FILE = WhisperDemo/Info.plist; - INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; - INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; - INFOPLIST_KEY_UILaunchScreen_Generation = YES; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.whisperdemo"; - PRODUCT_NAME = "$(TARGET_NAME)"; - SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Release; - }; -/* End XCBuildConfiguration section */ - -/* Begin XCConfigurationList section */ - WH0018 /* Build configuration list for PBXProject */ = { - isa = XCConfigurationList; - buildConfigurations = ( - WH0019, - WH0020, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; - WH0014 /* Build configuration list for PBXNativeTarget */ = { - isa = XCConfigurationList; - buildConfigurations = ( - WH0021, - WH0022, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; -/* End XCConfigurationList section */ - - }; - rootObject = WH0017; -} diff --git a/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AccentColor.colorset/Contents.json 
b/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AccentColor.colorset/Contents.json deleted file mode 100644 index eb87897..0000000 --- a/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AccentColor.colorset/Contents.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "colors" : [ - { - "idiom" : "universal" - } - ], - "info" : { - "author" : "xcode", - "version" : 1 - } -} diff --git a/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/Contents.json b/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/Contents.json deleted file mode 100644 index 73c0059..0000000 --- a/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/Contents.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "info" : { - "author" : "xcode", - "version" : 1 - } -} diff --git a/creative_apps/WhisperDemo/WhisperDemo/ContentView.swift b/creative_apps/WhisperDemo/WhisperDemo/ContentView.swift deleted file mode 100644 index d249cc9..0000000 --- a/creative_apps/WhisperDemo/WhisperDemo/ContentView.swift +++ /dev/null @@ -1,830 +0,0 @@ -import SwiftUI -import CoreML -import AVFoundation -import Accelerate - -// MARK: - Whisper Tiny Speech Recognition Demo -// -// Whisper is a general-purpose speech recognition model by OpenAI. -// The encoder processes a mel spectrogram (80 bins x 3000 frames for 30s of audio) -// and produces hidden states that the decoder uses autoregressively to generate tokens. -// -// This demo records audio via the microphone, computes a log-mel spectrogram using -// the Accelerate framework (vDSP), runs the WhisperTiny encoder CoreML model, and -// displays transcription results. The decoder step is simplified for demonstration; -// a production app should use WhisperKit or a full encoder+decoder pipeline. - -// MARK: - Supported Languages - -enum WhisperLanguage: String, CaseIterable, Identifiable { - case english = "English" - case japanese = "Japanese" - case spanish = "Spanish" - case french = "French" - case german = "German" - case chinese = "Chinese" - case korean = "Korean" - case portuguese = "Portuguese" - - var id: String { rawValue } - - var code: String { - switch self { - case .english: return "en" - case .japanese: return "ja" - case .spanish: return "es" - case .french: return "fr" - case .german: return "de" - case .chinese: return "zh" - case .korean: return "ko" - case .portuguese: return "pt" - } - } -} - -// MARK: - Transcription Entry - -struct TranscriptionEntry: Identifiable { - let id = UUID() - let text: String - let language: WhisperLanguage - let timestamp: Date - let duration: TimeInterval -} - -// MARK: - ContentView - -struct ContentView: View { - @StateObject private var viewModel = WhisperViewModel() - - var body: some View { - NavigationStack { - VStack(spacing: 0) { - // Language picker - HStack { - Text("Language") - .font(.subheadline) - .foregroundColor(.secondary) - Spacer() - Picker("Language", selection: $viewModel.selectedLanguage) { - ForEach(WhisperLanguage.allCases) { lang in - Text(lang.rawValue).tag(lang) - } - } - .pickerStyle(.menu) - } - .padding(.horizontal) - .padding(.top, 8) - - Divider() - .padding(.vertical, 8) - - // Waveform visualization - WaveformVisualization( - samples: viewModel.audioSamples, - isRecording: viewModel.isRecording - ) - .frame(height: 100) - .padding(.horizontal) - .padding(.bottom, 8) - - // Recording controls - VStack(spacing: 12) { - RecordButton( - isRecording: viewModel.isRecording, - onTap: { - if viewModel.isRecording { - viewModel.stopRecording() - } else { - viewModel.startRecording() - } - } - ) - - 
Text(viewModel.isRecording ? "Tap to stop recording" : "Tap to start recording") - .font(.caption) - .foregroundColor(.secondary) - - if viewModel.isRecording { - Text(viewModel.formattedRecordingDuration) - .font(.system(.title3, design: .monospaced)) - .foregroundColor(.red) - } - } - .padding(.vertical, 12) - - // Processing indicator - if viewModel.isProcessing { - VStack(spacing: 8) { - ProgressView() - .scaleEffect(1.2) - Text(viewModel.processingStatus) - .font(.caption) - .foregroundColor(.secondary) - ProgressView(value: viewModel.processingProgress) - .progressViewStyle(.linear) - .padding(.horizontal, 40) - } - .padding() - } - - // Error display - if let error = viewModel.errorMessage { - HStack { - Image(systemName: "exclamationmark.triangle.fill") - .foregroundColor(.red) - Text(error) - .font(.caption) - .foregroundColor(.red) - } - .padding() - .frame(maxWidth: .infinity) - .background(Color.red.opacity(0.1)) - .cornerRadius(8) - .padding(.horizontal) - } - - // Current transcription result - if let current = viewModel.currentTranscription { - VStack(alignment: .leading, spacing: 8) { - HStack { - Text("Transcription") - .font(.headline) - Spacer() - Button(action: { viewModel.copyToClipboard(current.text) }) { - Image(systemName: "doc.on.doc") - .font(.body) - } - } - Text(current.text) - .font(.body) - .padding() - .frame(maxWidth: .infinity, alignment: .leading) - .background(Color(.systemGray6)) - .cornerRadius(10) - HStack { - Text(current.language.rawValue) - .font(.caption2) - .padding(.horizontal, 8) - .padding(.vertical, 2) - .background(Color.accentColor.opacity(0.15)) - .cornerRadius(4) - Text(formatDuration(current.duration)) - .font(.caption2) - .foregroundColor(.secondary) - } - } - .padding(.horizontal) - .padding(.vertical, 8) - } - - Divider() - .padding(.vertical, 4) - - // History list - if viewModel.transcriptionHistory.isEmpty && viewModel.currentTranscription == nil { - Spacer() - VStack(spacing: 12) { - Image(systemName: "waveform.circle") - .font(.system(size: 48)) - .foregroundColor(.secondary.opacity(0.5)) - Text("Record audio to begin transcription") - .font(.subheadline) - .foregroundColor(.secondary) - } - Spacer() - } else { - ScrollView { - LazyVStack(spacing: 10) { - ForEach(viewModel.transcriptionHistory) { entry in - TranscriptionRow( - entry: entry, - onCopy: { viewModel.copyToClipboard(entry.text) } - ) - } - } - .padding(.horizontal) - .padding(.vertical, 8) - } - } - } - .navigationTitle("Whisper Transcribe") - .toolbar { - ToolbarItem(placement: .navigationBarTrailing) { - if !viewModel.transcriptionHistory.isEmpty { - Button("Clear") { - viewModel.clearHistory() - } - } - } - } - .onAppear { - viewModel.requestMicrophonePermission() - } - } - } - - private func formatDuration(_ duration: TimeInterval) -> String { - let seconds = Int(duration) - let ms = Int((duration - Double(seconds)) * 10) - return String(format: "%d.%ds", seconds, ms) - } -} - -// MARK: - Record Button - -struct RecordButton: View { - let isRecording: Bool - let onTap: () -> Void - - var body: some View { - Button(action: onTap) { - ZStack { - Circle() - .fill(isRecording ? Color.red.opacity(0.15) : Color.accentColor.opacity(0.1)) - .frame(width: 80, height: 80) - - Circle() - .fill(isRecording ? 
Color.red : Color.accentColor) - .frame(width: 60, height: 60) - - if isRecording { - RoundedRectangle(cornerRadius: 4) - .fill(Color.white) - .frame(width: 22, height: 22) - } else { - Circle() - .fill(Color.white) - .frame(width: 24, height: 24) - } - } - } - .buttonStyle(.plain) - .animation(.easeInOut(duration: 0.2), value: isRecording) - } -} - -// MARK: - Waveform Visualization - -struct WaveformVisualization: View { - let samples: [Float] - let isRecording: Bool - @State private var animationPhase: CGFloat = 0 - - var body: some View { - TimelineView(.animation(minimumInterval: 1.0 / 30.0)) { timeline in - Canvas { context, size in - let midY = size.height / 2 - let barWidth: CGFloat = 3 - let gap: CGFloat = 2 - let totalBarWidth = barWidth + gap - let barCount = Int(size.width / totalBarWidth) - - if isRecording && !samples.isEmpty { - let step = max(1, samples.count / barCount) - for i in 0.. Void - - var body: some View { - VStack(alignment: .leading, spacing: 6) { - HStack { - Text(entry.language.rawValue) - .font(.caption2) - .fontWeight(.medium) - .padding(.horizontal, 6) - .padding(.vertical, 2) - .background(Color.accentColor.opacity(0.12)) - .cornerRadius(4) - - Text(entry.timestamp, style: .time) - .font(.caption2) - .foregroundColor(.secondary) - - Spacer() - - Button(action: onCopy) { - Image(systemName: "doc.on.doc") - .font(.caption) - .foregroundColor(.secondary) - } - } - - Text(entry.text) - .font(.body) - .lineLimit(4) - } - .padding() - .background(Color(.systemGray6)) - .cornerRadius(10) - } -} - -// MARK: - Clamped Extension - -private extension Double { - func clamped(to range: ClosedRange) -> Double { - return min(max(self, range.lowerBound), range.upperBound) - } -} - -// MARK: - WhisperViewModel - -class WhisperViewModel: ObservableObject { - @Published var selectedLanguage: WhisperLanguage = .english - @Published var isRecording = false - @Published var isProcessing = false - @Published var processingStatus = "" - @Published var processingProgress: Double = 0 - @Published var errorMessage: String? - @Published var currentTranscription: TranscriptionEntry? - @Published var transcriptionHistory: [TranscriptionEntry] = [] - @Published var audioSamples: [Float] = [] - @Published var recordingDuration: TimeInterval = 0 - - private var audioRecorder: AVAudioRecorder? - private var recordingURL: URL? - private var recordingTimer: Timer? - private var sampleTimer: Timer? - private var recordingStartTime: Date? - - var formattedRecordingDuration: String { - let minutes = Int(recordingDuration) / 60 - let seconds = Int(recordingDuration) % 60 - let tenths = Int((recordingDuration - floor(recordingDuration)) * 10) - return String(format: "%d:%02d.%d", minutes, seconds, tenths) - } - - // MARK: - Microphone Permission - - func requestMicrophonePermission() { - AVAudioSession.sharedInstance().requestRecordPermission { [weak self] granted in - DispatchQueue.main.async { - if !granted { - self?.errorMessage = "Microphone access denied. Please enable it in Settings." 
- } - } - } - } - - // MARK: - Recording - - func startRecording() { - errorMessage = nil - currentTranscription = nil - - let session = AVAudioSession.sharedInstance() - do { - try session.setCategory(.playAndRecord, mode: .default, options: [.defaultToSpeaker]) - try session.setActive(true) - } catch { - errorMessage = "Failed to configure audio session: \(error.localizedDescription)" - return - } - - let tempDir = FileManager.default.temporaryDirectory - let fileName = "whisper_recording_\(UUID().uuidString).wav" - let fileURL = tempDir.appendingPathComponent(fileName) - recordingURL = fileURL - - let settings: [String: Any] = [ - AVFormatIDKey: Int(kAudioFormatLinearPCM), - AVSampleRateKey: 16000.0, - AVNumberOfChannelsKey: 1, - AVLinearPCMBitDepthKey: 16, - AVLinearPCMIsFloatKey: false, - AVLinearPCMIsBigEndianKey: false - ] - - do { - audioRecorder = try AVAudioRecorder(url: fileURL, settings: settings) - audioRecorder?.isMeteringEnabled = true - audioRecorder?.record() - isRecording = true - recordingStartTime = Date() - recordingDuration = 0 - audioSamples = [] - startTimers() - } catch { - errorMessage = "Failed to start recording: \(error.localizedDescription)" - } - } - - func stopRecording() { - guard isRecording else { return } - - audioRecorder?.stop() - isRecording = false - stopTimers() - - let duration = recordingDuration - - guard let url = recordingURL else { - errorMessage = "Recording file not found." - return - } - - processRecording(url: url, duration: duration) - } - - private func startTimers() { - recordingTimer = Timer.scheduledTimer(withTimeInterval: 0.1, repeats: true) { [weak self] _ in - guard let self = self, let start = self.recordingStartTime else { return } - DispatchQueue.main.async { - self.recordingDuration = Date().timeIntervalSince(start) - } - } - - sampleTimer = Timer.scheduledTimer(withTimeInterval: 0.05, repeats: true) { [weak self] _ in - guard let self = self else { return } - self.audioRecorder?.updateMeters() - let power = self.audioRecorder?.averagePower(forChannel: 0) ?? -160 - // Convert dB to linear amplitude (0..1) - let linear = pow(10, power / 20) - DispatchQueue.main.async { - self.audioSamples.append(linear) - // Keep a rolling window of samples for visualization - if self.audioSamples.count > 400 { - self.audioSamples.removeFirst(self.audioSamples.count - 400) - } - } - } - } - - private func stopTimers() { - recordingTimer?.invalidate() - recordingTimer = nil - sampleTimer?.invalidate() - sampleTimer = nil - } - - // MARK: - Audio Processing - - private func processRecording(url: URL, duration: TimeInterval) { - isProcessing = true - errorMessage = nil - processingProgress = 0 - processingStatus = "Loading audio..." - - Task { - do { - let transcription = try await runWhisperPipeline(url: url, duration: duration) - await MainActor.run { - let entry = TranscriptionEntry( - text: transcription, - language: self.selectedLanguage, - timestamp: Date(), - duration: duration - ) - self.currentTranscription = entry - self.transcriptionHistory.insert(entry, at: 0) - self.isProcessing = false - self.processingProgress = 1.0 - } - } catch { - await MainActor.run { - self.errorMessage = error.localizedDescription - self.isProcessing = false - } - } - - // Clean up temp file - try? FileManager.default.removeItem(at: url) - } - } - - /// Full Whisper pipeline: load audio -> compute mel spectrogram -> run encoder -> decode - /// - /// NOTE: The decoder step is simplified here. A full implementation would: - /// 1. 
Feed encoder output into the decoder model autoregressively
- /// 2. Use greedy or beam search to generate token IDs
- /// 3. Decode token IDs using the Whisper tokenizer
- /// For production use, consider WhisperKit (github.com/argmaxinc/WhisperKit).
- private func runWhisperPipeline(url: URL, duration: TimeInterval) async throws -> String {
- // Step 1: Load audio samples from WAV file
- await updateProgress("Loading audio file...", progress: 0.1)
-
- let audioData = try loadAudioSamples(from: url)
-
- // Step 2: Compute log-mel spectrogram using Accelerate
- await updateProgress("Computing mel spectrogram...", progress: 0.3)
-
- let melSpectrogram = try computeMelSpectrogram(from: audioData)
-
- // Step 3: Load and run encoder model
- await updateProgress("Running Whisper encoder...", progress: 0.5)
-
- guard let modelURL = Bundle.main.url(forResource: "WhisperTinyEncoder", withExtension: "mlmodelc") else {
- throw WhisperError.modelNotFound(
- "WhisperTinyEncoder.mlmodelc not found in bundle. " +
- "Run convert_whisper.py to generate the model, then add the compiled " +
- "WhisperTinyEncoder.mlmodelc to the Xcode project."
- )
- }
-
- let config = MLModelConfiguration()
- config.computeUnits = .cpuAndNeuralEngine
- let model = try MLModel(contentsOf: modelURL, configuration: config)
-
- // Prepare mel input: shape (1, 80, 3000)
- let melInput = try MLMultiArray(shape: [1, 80, 3000], dataType: .float32)
- let melCount = min(melSpectrogram.count, 80 * 3000)
- for i in 0..<melCount {
- melInput[i] = NSNumber(value: melSpectrogram[i])
- }
-
- // The input feature name ("mel") must match what convert_whisper.py exported
- let inputProvider = try MLDictionaryFeatureProvider(dictionary: ["mel": MLFeatureValue(multiArray: melInput)])
- let encoderOutput = try model.prediction(from: inputProvider)
-
- await updateProgress("Decoding...", progress: 0.8)
-
- // Simplified decode step (see NOTE above): report the encoder result instead of
- // running the full autoregressive decoder + tokenizer.
- guard let featureName = encoderOutput.featureNames.first,
- let encoded = encoderOutput.featureValue(for: featureName)?.multiArrayValue else {
- throw WhisperError.processingFailed("Encoder produced no output.")
- }
- let secs = String(format: "%.1f", duration)
- return "Encoder produced \(encoded.count) features for \(secs)s of audio. Add the converted decoder model for full transcription."
- }
-
- /// Load 16 kHz mono PCM samples from the recorded WAV file
- private func loadAudioSamples(from url: URL) throws -> [Float] {
- let fileData = try Data(contentsOf: url)
-
- // WAV header is 44 bytes; PCM 16-bit mono samples follow
- guard fileData.count > 44 else {
- throw WhisperError.processingFailed("Audio file too short or corrupted.")
- }
-
- let sampleData = fileData.dropFirst(44)
- let sampleCount = sampleData.count / 2 // 16-bit = 2 bytes per sample
-
- var floatSamples = [Float](repeating: 0, count: sampleCount)
- sampleData.withUnsafeBytes { rawBuffer in
- guard let baseAddress = rawBuffer.baseAddress else { return }
- let int16Ptr = baseAddress.bindMemory(to: Int16.self, capacity: sampleCount)
- // Convert Int16 samples to Float32 normalized to [-1, 1] using vDSP
- vDSP_vflt16(int16Ptr, 1, &floatSamples, 1, vDSP_Length(sampleCount))
- var scale: Float = 1.0 / 32768.0
- vDSP_vsmul(floatSamples, 1, &scale, &floatSamples, 1, vDSP_Length(sampleCount))
- }
-
- return floatSamples
- }
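-
- // A minimal sketch of the greedy decoding loop the NOTE above describes. It assumes a
- // hypothetical converted decoder named "WhisperTinyDecoder" with inputs "tokens" and
- // "encoder_output" and a "logits" output; the real names depend on convert_whisper.py,
- // so treat this as a template rather than a drop-in implementation.
- private func greedyDecode(encoderOutput: MLMultiArray, decoder: MLModel, maxTokens: Int = 224) throws -> [Int] {
- let startToken = 50258 // <|startoftranscript|> in the multilingual vocabulary
- let endToken = 50257 // <|endoftext|>
- var tokens: [Int] = [startToken]
- for _ in 0..<maxTokens {
- // Re-feed the whole token prefix each step (no KV cache in this sketch)
- let tokenArray = try MLMultiArray(shape: [1, NSNumber(value: tokens.count)], dataType: .int32)
- for (i, t) in tokens.enumerated() { tokenArray[i] = NSNumber(value: t) }
- let input = try MLDictionaryFeatureProvider(dictionary: [
- "tokens": MLFeatureValue(multiArray: tokenArray),
- "encoder_output": MLFeatureValue(multiArray: encoderOutput)
- ])
- let output = try decoder.prediction(from: input)
- guard let logits = output.featureValue(for: "logits")?.multiArrayValue else { break }
- // Greedy step: argmax over the vocabulary at the last position
- let vocabSize = logits.shape.last?.intValue ?? 0
- guard vocabSize > 0 else { break }
- let lastOffset = logits.count - vocabSize
- var best = 0
- var bestVal = -Float.greatestFiniteMagnitude
- for v in 0..<vocabSize {
- let val = logits[lastOffset + v].floatValue
- if val > bestVal { bestVal = val; best = v }
- }
- if best == endToken { break }
- tokens.append(best)
- }
- // Token IDs still need the Whisper tokenizer (e.g. a bundled vocab JSON) to become text.
- return tokens
- }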
-
- /// Compute 80-bin log-mel spectrogram from audio samples
- ///
- /// Whisper expects: 80 mel bins, 3000 time frames (for 30s at 16kHz with hop=160).
- /// Parameters: FFT size = 400, hop length = 160, sample rate = 16000.
- ///
- /// This implementation uses Accelerate's vDSP for the FFT computation.
- private func computeMelSpectrogram(from samples: [Float]) throws -> [Float] {
- let fftSize = 400
- let hopLength = 160
- let numMelBins = 80
- let maxFrames = 3000
- let sampleRate: Float = 16000.0
-
- // Pad or truncate audio to 30 seconds (480000 samples)
- let targetLength = 480000
- var paddedSamples: [Float]
- if samples.count >= targetLength {
- paddedSamples = Array(samples.prefix(targetLength))
- } else {
- paddedSamples = samples + [Float](repeating: 0, count: targetLength - samples.count)
- }
-
- // Number of frames
- let numFrames = min((paddedSamples.count - fftSize) / hopLength + 1, maxFrames)
-
- // Create FFT setup
- let log2n = vDSP_Length(ceil(log2(Float(fftSize))))
- guard let fftSetup = vDSP_create_fftsetup(log2n, FFTRadix(kFFTRadix2)) else {
- throw WhisperError.processingFailed("Failed to create FFT setup.")
- }
- defer { vDSP_destroy_fftsetup(fftSetup) }
-
- let fftSizeAligned = Int(pow(2, ceil(log2(Float(fftSize)))))
- let halfFFT = fftSizeAligned / 2
-
- // Hann window
- var window = [Float](repeating: 0, count: fftSize)
- vDSP_hann_window(&window, vDSP_Length(fftSize), Int32(vDSP_HANN_NORM))
-
- // Compute mel filter bank (simplified triangular filters)
- let melFilters = createMelFilterBank(
- numMelBins: numMelBins,
- fftSize: fftSizeAligned,
- sampleRate: sampleRate,
- numFreqBins: halfFFT + 1
- )
-
- // Output: (numMelBins x numFrames) stored row-major
- var melSpectrogram = [Float](repeating: 0, count: numMelBins * maxFrames)
-
- // Process each frame
- for frame in 0..<numFrames {
- let start = frame * hopLength
-
- // Apply the Hann window; zero-pad up to the aligned FFT size
- var windowed = [Float](repeating: 0, count: fftSizeAligned)
- for i in 0..<fftSize {
- windowed[i] = paddedSamples[start + i] * window[i]
- }
-
- // Real FFT in split-complex packed form
- var realPart = [Float](repeating: 0, count: halfFFT)
- var imagPart = [Float](repeating: 0, count: halfFFT)
- var power = [Float](repeating: 0, count: halfFFT + 1)
- realPart.withUnsafeMutableBufferPointer { realBuf in
- imagPart.withUnsafeMutableBufferPointer { imagBuf in
- var split = DSPSplitComplex(realp: realBuf.baseAddress!, imagp: imagBuf.baseAddress!)
- windowed.withUnsafeBytes { raw in
- vDSP_ctoz(raw.bindMemory(to: DSPComplex.self).baseAddress!, 2, &split, 1, vDSP_Length(halfFFT))
- }
- vDSP_fft_zrip(fftSetup, &split, 1, log2n, FFTDirection(FFT_FORWARD))
- // Magnitude squared; vDSP packs the Nyquist term into imagp[0]
- power[halfFFT] = imagBuf[0] * imagBuf[0]
- split.imagp[0] = 0
- vDSP_zvmags(&split, 1, &power, 1, vDSP_Length(halfFFT))
- }
- }
-
- // Apply the triangular mel filters to the power spectrum
- for m in 0..<numMelBins {
- var melEnergy: Float = 0
- melFilters.withUnsafeBufferPointer { filterBuf in
- vDSP_dotpr(filterBuf.baseAddress! + m * (halfFFT + 1), 1, power, 1, &melEnergy, vDSP_Length(halfFFT + 1))
- }
- melSpectrogram[m * maxFrames + frame] = melEnergy
- }
- }
-
- // Log compression: log10, clamp to 8 below the peak, rescale (Whisper-style)
- var maxLog: Float = -Float.greatestFiniteMagnitude
- for i in 0..<melSpectrogram.count {
- let logVal = log10(max(melSpectrogram[i], 1e-10))
- melSpectrogram[i] = logVal
- if logVal > maxLog { maxLog = logVal }
- }
- for i in 0..<melSpectrogram.count {
- melSpectrogram[i] = (max(melSpectrogram[i], maxLog - 8.0) + 4.0) / 4.0
- }
-
- return melSpectrogram
- }
-
- /// Build a bank of triangular mel filters (numMelBins x numFreqBins, row-major)
- private func createMelFilterBank(numMelBins: Int, fftSize: Int, sampleRate: Float, numFreqBins: Int) -> [Float] {
- func hzToMel(_ hz: Float) -> Float {
- return 2595.0 * log10(1.0 + hz / 700.0)
- }
-
- func melToHz(_ mel: Float) -> Float {
- return 700.0 * (pow(10.0, mel / 2595.0) - 1.0)
- }
-
- let lowFreq: Float = 0
- let highFreq = sampleRate / 2.0
- let lowMel = hzToMel(lowFreq)
- let highMel = hzToMel(highFreq)
-
- // Equally spaced mel points
- let numPoints = numMelBins + 2
- var melPoints = [Float](repeating: 0, count: numPoints)
- for i in 0..<numPoints {
- melPoints[i] = lowMel + (highMel - lowMel) * Float(i) / Float(numPoints - 1)
- }
-
- // Map each mel point back to an FFT bin index
- var binPoints = [Int](repeating: 0, count: numPoints)
- for i in 0..<numPoints {
- binPoints[i] = min(numFreqBins - 1, Int(Float(fftSize) * melToHz(melPoints[i]) / sampleRate))
- }
-
- var filters = [Float](repeating: 0, count: numMelBins * numFreqBins)
- for m in 0..<numMelBins {
- let left = binPoints[m]
- let center = binPoints[m + 1]
- let right = binPoints[m + 2]
- // Rising edge of the triangle
- for k in left..<center {
- if center > left {
- filters[m * numFreqBins + k] = Float(k - left) / Float(center - left)
- }
- }
- // Falling edge of the triangle
- for k in center..<right {
- if right > center {
- filters[m * numFreqBins + k] = Float(right - k) / Float(right - center)
- }
- }
- }
-
- return filters
- }
-
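- // Sanity check on the constants above: hzToMel(1000) = 2595 * log10(1 + 1000/700), which is
- // about 1000 mel by construction, and the 8 kHz Nyquist limit of 16 kHz audio maps to about
- // 2840 mel, so the 80 triangular filters sit roughly 35 mel apart. Whisper's reference
- // implementation uses Slaney-style (librosa) filters, so this HTK-style bank is a close
- // approximation rather than a bit-identical reproduction.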
-
- @MainActor
- private func updateProgress(_ status: String, progress: Double) {
- self.processingStatus = status
- self.processingProgress = progress
- }
-
- // MARK: - Clipboard
-
- func copyToClipboard(_ text: String) {
- UIPasteboard.general.string = text
- }
-
- // MARK: - History
-
- func clearHistory() {
- transcriptionHistory.removeAll()
- currentTranscription = nil
- }
-}
-
-// MARK: - Errors
-
-enum WhisperError: LocalizedError {
- case modelNotFound(String)
- case processingFailed(String)
-
- var errorDescription: String? {
- switch self {
- case .modelNotFound(let msg): return msg
- case .processingFailed(let msg): return msg
- }
- }
-}
-
-#Preview {
- ContentView()
-}
diff --git a/creative_apps/WhisperDemo/WhisperDemo/Info.plist b/creative_apps/WhisperDemo/WhisperDemo/Info.plist
deleted file mode 100644
index 53711f3..0000000
--- a/creative_apps/WhisperDemo/WhisperDemo/Info.plist
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-    <key>NSMicrophoneUsageDescription</key>
-    <string>This app needs microphone access for speech recognition.</string>
-</dict>
-</plist>
diff --git a/creative_apps/WhisperDemo/WhisperDemo/WhisperDemoApp.swift b/creative_apps/WhisperDemo/WhisperDemo/WhisperDemoApp.swift
deleted file mode 100644
index 6468c10..0000000
--- a/creative_apps/WhisperDemo/WhisperDemo/WhisperDemoApp.swift
+++ /dev/null
@@ -1,10 +0,0 @@
-import SwiftUI
-
-@main
-struct WhisperDemoApp: App {
- var body: some Scene {
- WindowGroup {
- ContentView()
- }
- }
-}
diff --git a/sample_apps/DWPoseDemo/DWPoseDemo.xcodeproj/project.pbxproj b/sample_apps/DWPoseDemo/DWPoseDemo.xcodeproj/project.pbxproj
deleted file mode 100644
index adab51d..0000000
--- a/sample_apps/DWPoseDemo/DWPoseDemo.xcodeproj/project.pbxproj
+++ /dev/null
@@ -1,340 +0,0 @@
-// !$*UTF8*$!
-{
- archiveVersion = 1;
- classes = {
- };
- objectVersion = 56;
- objects = {
-
-/* Begin PBXBuildFile section */
- DW0000010000000000000001 /* DWPoseDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = DW0000020000000000000001 /* DWPoseDemoApp.swift */; };
- DW0000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = DW0000020000000000000002 /* ContentView.swift */; };
- DW0000010000000000000004 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = DW0000020000000000000004 /* Assets.xcassets */; };
-/* End PBXBuildFile section */
-
-/* Begin PBXFileReference section */
- DW0000020000000000000001 /* DWPoseDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DWPoseDemoApp.swift; sourceTree = "<group>"; };
- DW0000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
- DW0000020000000000000004 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
- DW0000020000000000000005 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
- DW0000020000000000000010 /* DWPoseDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DWPoseDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
-/* End PBXFileReference section */
-
-/* Begin PBXFrameworksBuildPhase section */
- DW0000030000000000000001 /* Frameworks */ = {
- isa = PBXFrameworksBuildPhase;
- buildActionMask = 2147483647;
- files = (
- );
- runOnlyForDeploymentPostprocessing = 0;
- };
-/* End PBXFrameworksBuildPhase section */
-
-/* Begin PBXGroup section */
- DW0000040000000000000001 = {
- isa = PBXGroup;
- children = (
- DW0000040000000000000002 /* DWPoseDemo */,
- DW0000040000000000000003 /* Products */,
- );
- sourceTree = "<group>";
- };
- DW0000040000000000000002 /* DWPoseDemo */ = {
- isa = PBXGroup;
- children = (
- DW0000020000000000000001 /* DWPoseDemoApp.swift */,
- DW0000020000000000000002 /* ContentView.swift */,
- DW0000020000000000000004 /* Assets.xcassets */,
- DW0000020000000000000005 /* Info.plist */,
- );
- path = DWPoseDemo;
- sourceTree = "<group>";
- };
- DW0000040000000000000003 /* Products */ = {
- isa = PBXGroup;
- children = (
- DW0000020000000000000010 /* DWPoseDemo.app */,
- );
- name = Products;
- sourceTree = "<group>";
- };
-/* End PBXGroup section */
-
-/* Begin PBXNativeTarget section */
- DW0000050000000000000001 /* DWPoseDemo */ = {
- isa = PBXNativeTarget;
- buildConfigurationList = DW0000070000000000000001 /* Build configuration list for PBXNativeTarget "DWPoseDemo" */;
- buildPhases = (
- DW0000060000000000000001 /* Sources */,
- DW0000030000000000000001 /* Frameworks */,
- 
DW0000060000000000000002 /* Resources */, - ); - buildRules = ( - ); - dependencies = ( - ); - name = DWPoseDemo; - productName = DWPoseDemo; - productReference = DW0000020000000000000010 /* DWPoseDemo.app */; - productType = "com.apple.product-type.application"; - }; -/* End PBXNativeTarget section */ - -/* Begin PBXProject section */ - DW0000080000000000000001 /* Project object */ = { - isa = PBXProject; - attributes = { - BuildIndependentTargetsInParallel = 1; - LastSwiftUpdateCheck = 1500; - LastUpgradeCheck = 1500; - TargetAttributes = { - DW0000050000000000000001 = { - CreatedOnToolsVersion = 15.0; - }; - }; - }; - buildConfigurationList = DW0000070000000000000003 /* Build configuration list for PBXProject "DWPoseDemo" */; - compatibilityVersion = "Xcode 14.0"; - developmentRegion = en; - hasScannedForEncodings = 0; - knownRegions = ( - en, - Base, - ); - mainGroup = DW0000040000000000000001; - productRefGroup = DW0000040000000000000003 /* Products */; - projectDirPath = ""; - projectRoot = ""; - targets = ( - DW0000050000000000000001 /* DWPoseDemo */, - ); - }; -/* End PBXProject section */ - -/* Begin PBXResourcesBuildPhase section */ - DW0000060000000000000002 /* Resources */ = { - isa = PBXResourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - DW0000010000000000000004 /* Assets.xcassets in Resources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXResourcesBuildPhase section */ - -/* Begin PBXSourcesBuildPhase section */ - DW0000060000000000000001 /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - DW0000010000000000000001 /* DWPoseDemoApp.swift in Sources */, - DW0000010000000000000002 /* ContentView.swift in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXSourcesBuildPhase section */ - -/* Begin XCBuildConfiguration section */ - DW0000090000000000000001 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; - CLANG_ANALYZER_NONNULL = YES; - CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - CLANG_ENABLE_OBJC_WEAK = YES; - CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; - CLANG_WARN_BOOL_CONVERSION = YES; - CLANG_WARN_COMMA = YES; - CLANG_WARN_CONSTANT_CONVERSION = YES; - CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; - CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; - CLANG_WARN_DOCUMENTATION_COMMENTS = YES; - CLANG_WARN_EMPTY_BODY = YES; - CLANG_WARN_ENUM_CONVERSION = YES; - CLANG_WARN_INFINITE_RECURSION = YES; - CLANG_WARN_INT_CONVERSION = YES; - CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; - CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; - CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; - CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; - CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; - CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; - CLANG_WARN_STRICT_PROTOTYPES = YES; - CLANG_WARN_SUSPICIOUS_MOVE = YES; - CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; - CLANG_WARN_UNREACHABLE_CODE = YES; - CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = dwarf; - ENABLE_STRICT_OBJC_MSGSEND = YES; - ENABLE_TESTABILITY = YES; - ENABLE_USER_SCRIPT_SANDBOXING = YES; - GCC_C_LANGUAGE_STANDARD = gnu17; - GCC_DYNAMIC_NO_PIC = NO; - GCC_NO_COMMON_BLOCKS = YES; - GCC_OPTIMIZATION_LEVEL = 0; - GCC_PREPROCESSOR_DEFINITIONS = ( - "DEBUG=1", - 
"$(inherited)", - ); - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNDECLARED_SELECTOR = YES; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - GCC_WARN_UNUSED_FUNCTION = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; - MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; - MTL_FAST_MATH = YES; - ONLY_ACTIVE_ARCH = YES; - SDKROOT = iphoneos; - SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; - SWIFT_OPTIMIZATION_LEVEL = "-Onone"; - }; - name = Debug; - }; - DW0000090000000000000002 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; - CLANG_ANALYZER_NONNULL = YES; - CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - CLANG_ENABLE_OBJC_WEAK = YES; - CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; - CLANG_WARN_BOOL_CONVERSION = YES; - CLANG_WARN_COMMA = YES; - CLANG_WARN_CONSTANT_CONVERSION = YES; - CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; - CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; - CLANG_WARN_DOCUMENTATION_COMMENTS = YES; - CLANG_WARN_EMPTY_BODY = YES; - CLANG_WARN_ENUM_CONVERSION = YES; - CLANG_WARN_INFINITE_RECURSION = YES; - CLANG_WARN_INT_CONVERSION = YES; - CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; - CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; - CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; - CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; - CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; - CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; - CLANG_WARN_STRICT_PROTOTYPES = YES; - CLANG_WARN_SUSPICIOUS_MOVE = YES; - CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; - CLANG_WARN_UNREACHABLE_CODE = YES; - CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; - ENABLE_NS_ASSERTIONS = NO; - ENABLE_STRICT_OBJC_MSGSEND = YES; - ENABLE_USER_SCRIPT_SANDBOXING = YES; - GCC_C_LANGUAGE_STANDARD = gnu17; - GCC_NO_COMMON_BLOCKS = YES; - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNDECLARED_SELECTOR = YES; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - GCC_WARN_UNUSED_FUNCTION = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; - MTL_ENABLE_DEBUG_INFO = NO; - MTL_FAST_MATH = YES; - SDKROOT = iphoneos; - SWIFT_COMPILATION_MODE = wholemodule; - VALIDATE_PRODUCT = YES; - }; - name = Release; - }; - DW0000090000000000000003 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CODE_SIGN_STYLE = Automatic; - CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = ""; - ENABLE_PREVIEWS = YES; - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_FILE = DWPoseDemo/Info.plist; - INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time pose estimation."; - INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; - INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; - INFOPLIST_KEY_UILaunchScreen_Generation = YES; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait 
UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.dwpose"; - PRODUCT_NAME = "$(TARGET_NAME)"; - SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Debug; - }; - DW0000090000000000000004 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CODE_SIGN_STYLE = Automatic; - CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = ""; - ENABLE_PREVIEWS = YES; - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_FILE = DWPoseDemo/Info.plist; - INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time pose estimation."; - INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; - INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; - INFOPLIST_KEY_UILaunchScreen_Generation = YES; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.dwpose"; - PRODUCT_NAME = "$(TARGET_NAME)"; - SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Release; - }; -/* End XCBuildConfiguration section */ - -/* Begin XCConfigurationList section */ - DW0000070000000000000001 /* Build configuration list for PBXNativeTarget "DWPoseDemo" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - DW0000090000000000000003 /* Debug */, - DW0000090000000000000004 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; - DW0000070000000000000003 /* Build configuration list for PBXProject "DWPoseDemo" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - DW0000090000000000000001 /* Debug */, - DW0000090000000000000002 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; -/* End XCConfigurationList section */ - - }; - rootObject = DW0000080000000000000001 /* Project object */; -} diff --git a/sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/AccentColor.colorset/Contents.json deleted file mode 100644 index eb87897..0000000 --- a/sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/AccentColor.colorset/Contents.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "colors" : [ - { - "idiom" : "universal" - } - ], - "info" : { - "author" : "xcode", - "version" : 1 - } -} diff --git a/sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/Contents.json b/sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/Contents.json deleted file mode 100644 index 73c0059..0000000 --- a/sample_apps/DWPoseDemo/DWPoseDemo/Assets.xcassets/Contents.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "info" : { - "author" : "xcode", - "version" : 1 - } -} diff --git a/sample_apps/DWPoseDemo/DWPoseDemo/ContentView.swift b/sample_apps/DWPoseDemo/DWPoseDemo/ContentView.swift deleted file mode 100644 index 
817df8a..0000000
--- a/sample_apps/DWPoseDemo/DWPoseDemo/ContentView.swift
+++ /dev/null
@@ -1,659 +0,0 @@
-import SwiftUI
-import UIKit
-import AVFoundation
-import CoreML
-import Accelerate
-
-// MARK: - COCO Keypoint Definitions
-
-let keypointNames: [String] = [
- "nose", // 0
- "left_eye", // 1
- "right_eye", // 2
- "left_ear", // 3
- "right_ear", // 4
- "left_shoulder", // 5
- "right_shoulder", // 6
- "left_elbow", // 7
- "right_elbow", // 8
- "left_wrist", // 9
- "right_wrist", // 10
- "left_hip", // 11
- "right_hip", // 12
- "left_knee", // 13
- "right_knee", // 14
- "left_ankle", // 15
- "right_ankle", // 16
-]
-
-let skeletonConnections: [(Int, Int)] = [
- (0, 1), (0, 2), (1, 3), (2, 4), // Head
- (5, 6), // Shoulders
- (5, 7), (7, 9), // Left arm
- (6, 8), (8, 10), // Right arm
- (5, 11), (6, 12), // Torso
- (11, 12), // Hips
- (11, 13), (13, 15), // Left leg
- (12, 14), (14, 16), // Right leg
-]
-
-// Left-side keypoint indices (blue)
-let leftIndices: Set<Int> = [1, 3, 5, 7, 9, 11, 13, 15]
-// Right-side keypoint indices (red)
-let rightIndices: Set<Int> = [2, 4, 6, 8, 10, 12, 14, 16]
-// Center keypoint indices (green)
-let centerIndices: Set<Int> = [0]
-
-// MARK: - Keypoint
-
-struct Keypoint {
- let x: CGFloat
- let y: CGFloat
- let confidence: Float
-}
-
-// MARK: - Connection Color Helper
-
-func connectionColor(for connection: (Int, Int)) -> Color {
- let (a, b) = connection
- // Shoulder-to-shoulder and hip-to-hip are center connections
- if (a == 5 && b == 6) || (a == 11 && b == 12) {
- return .green
- }
- // Torso connections use the side of the limb endpoint
- if leftIndices.contains(a) || leftIndices.contains(b) {
- return .blue
- }
- if rightIndices.contains(a) || rightIndices.contains(b) {
- return .red
- }
- return .green
-}
-
-func keypointColor(for index: Int) -> Color {
- if leftIndices.contains(index) { return .blue }
- if rightIndices.contains(index) { return .red }
- return .green
-}
-
-// MARK: - Camera Manager
-
-class CameraManager: NSObject, ObservableObject {
- let session = AVCaptureSession()
- var onFrame: ((CMSampleBuffer) -> Void)?
-
- private let sessionQueue = DispatchQueue(label: "camera.session")
-
- func configure() {
- sessionQueue.async { [weak self] in
- self?.setupSession()
- }
- }
-
- private func setupSession() {
- session.beginConfiguration()
- session.sessionPreset = .high
-
- guard let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .back),
- let input = try? 
AVCaptureDeviceInput(device: device) else { - session.commitConfiguration() - return - } - - if session.canAddInput(input) { - session.addInput(input) - } - - let output = AVCaptureVideoDataOutput() - output.setSampleBufferDelegate(self, queue: DispatchQueue(label: "camera.frame")) - output.alwaysDiscardsLateVideoFrames = true - - if session.canAddOutput(output) { - session.addOutput(output) - } - - session.commitConfiguration() - session.startRunning() - } - - func stop() { - sessionQueue.async { [weak self] in - self?.session.stopRunning() - } - } -} - -extension CameraManager: AVCaptureVideoDataOutputSampleBufferDelegate { - func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { - onFrame?(sampleBuffer) - } -} - -// MARK: - Camera Preview - -struct CameraPreview: UIViewRepresentable { - let session: AVCaptureSession - - func makeUIView(context: Context) -> UIView { - let view = UIView(frame: .zero) - let previewLayer = AVCaptureVideoPreviewLayer(session: session) - previewLayer.videoGravity = .resizeAspectFill - view.layer.addSublayer(previewLayer) - context.coordinator.previewLayer = previewLayer - return view - } - - func updateUIView(_ uiView: UIView, context: Context) { - context.coordinator.previewLayer?.frame = uiView.bounds - } - - func makeCoordinator() -> Coordinator { - Coordinator() - } - - class Coordinator { - var previewLayer: AVCaptureVideoPreviewLayer? - } -} - -// MARK: - Pose Estimator - -class PoseEstimator: ObservableObject { - @Published var keypoints: [Keypoint] = [] - @Published var fps: Double = 0 - @Published var detectedKeypointCount: Int = 0 - @Published var errorMessage: String? - - private var mlModel: MLModel? - private var isProcessing = false - private var lastTimestamp: CFTimeInterval = 0 - private var frameCount: Int = 0 - private let fpsUpdateInterval: CFTimeInterval = 0.5 - - private let confidenceThreshold: Float = 0.3 - private let smoothingFactor: CGFloat = 0.6 - private var previousKeypoints: [Keypoint] = [] - - // Model input dimensions - private let inputWidth = 192 - private let inputHeight = 256 - - // SimCC output dimensions (typically 2x input + some margin) - // For RTMPose with SimCC: x_simcc has shape (1, 17, 384), y_simcc has shape (1, 17, 512) - private let simccXSize = 384 // inputWidth * 2 - private let simccYSize = 512 // inputHeight * 2 - - init() { - loadModel() - } - - private func loadModel() { - // PLACEHOLDER: Add DWPose.mlpackage to the Xcode project. - // The compiled .mlmodelc will be bundled automatically. - // Convert using: python conversion_scripts/convert_dwpose.py - // Then drag DWPose.mlpackage into Xcode. - - guard let modelURL = Bundle.main.url(forResource: "DWPose", withExtension: "mlmodelc") else { - DispatchQueue.main.async { - self.errorMessage = "Model not found. Please add DWPose.mlpackage to the Xcode project." 
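- // If only the raw .mlpackage is bundled (rather than a compiled .mlmodelc),
- // MLModel.compileModel(at:) can compile it on-device at first launch; cache the
- // returned URL to avoid recompiling on every run.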
- } - return - } - - do { - let config = MLModelConfiguration() - config.computeUnits = .all - mlModel = try MLModel(contentsOf: modelURL, configuration: config) - } catch { - DispatchQueue.main.async { - self.errorMessage = "Failed to load model: \(error.localizedDescription)" - } - } - } - - func estimatePose(sampleBuffer: CMSampleBuffer) { - guard !isProcessing, let model = mlModel else { return } - guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } - isProcessing = true - - // Update FPS counter - let now = CACurrentMediaTime() - frameCount += 1 - if now - lastTimestamp >= fpsUpdateInterval { - let currentFPS = Double(frameCount) / (now - lastTimestamp) - frameCount = 0 - lastTimestamp = now - DispatchQueue.main.async { - self.fps = currentFPS - } - } - - // Preprocess: resize pixel buffer and create MLMultiArray input - guard let resizedBuffer = resizePixelBuffer(pixelBuffer, width: inputWidth, height: inputHeight) else { - isProcessing = false - return - } - - do { - let input = try createModelInput(from: resizedBuffer) - let output = try model.prediction(from: input) - let keypoints = postProcessSimCC(output: output) - - // Apply temporal smoothing - let smoothed = applySmoothingFilter(keypoints) - - let detected = smoothed.filter { $0.confidence >= confidenceThreshold }.count - DispatchQueue.main.async { - self.keypoints = smoothed - self.detectedKeypointCount = detected - self.previousKeypoints = smoothed - } - } catch { - // Silently skip frames with errors during inference - } - - isProcessing = false - } - - private func resizePixelBuffer(_ pixelBuffer: CVPixelBuffer, width: Int, height: Int) -> CVPixelBuffer? { - var resizedBuffer: CVPixelBuffer? - let attrs = [ - kCVPixelBufferCGImageCompatibilityKey: true, - kCVPixelBufferCGBitmapContextCompatibilityKey: true - ] as CFDictionary - - let status = CVPixelBufferCreate( - kCFAllocatorDefault, - width, height, - kCVPixelFormatType_32BGRA, - attrs, - &resizedBuffer - ) - guard status == kCVReturnSuccess, let outputBuffer = resizedBuffer else { return nil } - - CVPixelBufferLockBaseAddress(pixelBuffer, .readOnly) - CVPixelBufferLockBaseAddress(outputBuffer, []) - - guard let srcData = CVPixelBufferGetBaseAddress(pixelBuffer), - let dstData = CVPixelBufferGetBaseAddress(outputBuffer) else { - CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly) - CVPixelBufferUnlockBaseAddress(outputBuffer, []) - return nil - } - - var srcBuffer = vImage_Buffer( - data: srcData, - height: vImagePixelCount(CVPixelBufferGetHeight(pixelBuffer)), - width: vImagePixelCount(CVPixelBufferGetWidth(pixelBuffer)), - rowBytes: CVPixelBufferGetBytesPerRow(pixelBuffer) - ) - var dstBuffer = vImage_Buffer( - data: dstData, - height: vImagePixelCount(height), - width: vImagePixelCount(width), - rowBytes: CVPixelBufferGetBytesPerRow(outputBuffer) - ) - - vImageScale_ARGB8888(&srcBuffer, &dstBuffer, nil, vImage_Flags(kvImageHighQualityResampling)) - - CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly) - CVPixelBufferUnlockBaseAddress(outputBuffer, []) - - return outputBuffer - } - - private func createModelInput(from pixelBuffer: CVPixelBuffer) throws -> MLDictionaryFeatureProvider { - // Create MLMultiArray with shape (1, 3, 256, 192) - let shape: [NSNumber] = [1, 3, NSNumber(value: inputHeight), NSNumber(value: inputWidth)] - let inputArray = try MLMultiArray(shape: shape, dataType: .float32) - - CVPixelBufferLockBaseAddress(pixelBuffer, .readOnly) - defer { CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly) } - - guard 
let baseAddress = CVPixelBufferGetBaseAddress(pixelBuffer) else {
- throw NSError(domain: "DWPose", code: -1, userInfo: [NSLocalizedDescriptionKey: "Cannot access pixel buffer"])
- }
-
- let bytesPerRow = CVPixelBufferGetBytesPerRow(pixelBuffer)
- let ptr = baseAddress.assumingMemoryBound(to: UInt8.self)
-
- // ImageNet normalization: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
- let mean: [Float] = [0.485, 0.456, 0.406]
- let std: [Float] = [0.229, 0.224, 0.225]
-
- let channelStride = inputHeight * inputWidth
- for y in 0..<inputHeight {
- for x in 0..<inputWidth {
- // BGRA byte order from the resized camera buffer
- let pixelOffset = y * bytesPerRow + x * 4
- let b = Float(ptr[pixelOffset]) / 255.0
- let g = Float(ptr[pixelOffset + 1]) / 255.0
- let r = Float(ptr[pixelOffset + 2]) / 255.0
- let spatialIndex = y * inputWidth + x
- inputArray[spatialIndex] = NSNumber(value: (r - mean[0]) / std[0])
- inputArray[channelStride + spatialIndex] = NSNumber(value: (g - mean[1]) / std[1])
- inputArray[2 * channelStride + spatialIndex] = NSNumber(value: (b - mean[2]) / std[2])
- }
- }
-
- // The input feature name ("input") must match the conversion script's export
- return try MLDictionaryFeatureProvider(dictionary: ["input": MLFeatureValue(multiArray: inputArray)])
- }
-
- private func postProcessSimCC(output: MLFeatureProvider) -> [Keypoint] {
- // RTMPose SimCC outputs: simcc_x (1, 17, simccXSize) and simcc_y (1, 17, simccYSize)
- // Each row contains logits for discretized coordinate bins along X or Y axis
- // The argmax of each row gives the predicted coordinate
-
- guard let simccX = output.featureValue(for: "simcc_x")?.multiArrayValue,
- let simccY = output.featureValue(for: "simcc_y")?.multiArrayValue else {
- // Fallback: try alternative output names
- return postProcessHeatmap(output: output)
- }
-
- let numKeypoints = 17
- var keypoints: [Keypoint] = []
-
- let xDim = simccX.shape.last?.intValue ?? simccXSize
- let yDim = simccY.shape.last?.intValue ?? simccYSize
-
- for k in 0..<numKeypoints {
- // Find argmax and max value for x coordinate
- var maxXVal: Float = -Float.greatestFiniteMagnitude
- var maxXIdx: Int = 0
- for i in 0..<xDim {
- let val = simccX[[0, k, i] as [NSNumber]].floatValue
- if val > maxXVal {
- maxXVal = val
- maxXIdx = i
- }
- }
-
- // Find argmax and max value for y coordinate
- var maxYVal: Float = -Float.greatestFiniteMagnitude
- var maxYIdx: Int = 0
- for i in 0..<yDim {
- let val = simccY[[0, k, i] as [NSNumber]].floatValue
- if val > maxYVal {
- maxYVal = val
- maxYIdx = i
- }
- }
-
- // Convert discretized coordinates back to normalized [0, 1]
- let normX = CGFloat(maxXIdx) / CGFloat(xDim)
- let normY = CGFloat(maxYIdx) / CGFloat(yDim)
-
- // Confidence is the average of softmax peaks
- let confidence = (maxXVal + maxYVal) / 2.0
-
- keypoints.append(Keypoint(x: normX, y: normY, confidence: confidence))
- }
-
- return keypoints
- }
-
- private func postProcessHeatmap(output: MLFeatureProvider) -> [Keypoint] {
- // Fallback heatmap-based post-processing
- // Some models output standard heatmaps instead of SimCC
- guard let featureNames = output.featureNames.first,
- let heatmaps = output.featureValue(for: featureNames)?.multiArrayValue else {
- return Array(repeating: Keypoint(x: 0, y: 0, confidence: 0), count: 17)
- }
-
- let numKeypoints = 17
- let heatmapH = heatmaps.shape[2].intValue
- let heatmapW = heatmaps.shape[3].intValue
- var keypoints: [Keypoint] = []
-
- for k in 0..<numKeypoints {
- var maxVal: Float = -Float.greatestFiniteMagnitude
- var maxRow = 0
- var maxCol = 0
- for row in 0..<heatmapH {
- for col in 0..<heatmapW {
- let val = heatmaps[[0, k, row, col] as [NSNumber]].floatValue
- if val > 
maxVal { - maxVal = val - maxRow = row - maxCol = col - } - } - } - - let normX = CGFloat(maxCol) / CGFloat(heatmapW) - let normY = CGFloat(maxRow) / CGFloat(heatmapH) - - keypoints.append(Keypoint(x: normX, y: normY, confidence: maxVal)) - } - - return keypoints - } - - private func applySmoothingFilter(_ current: [Keypoint]) -> [Keypoint] { - guard previousKeypoints.count == current.count else { return current } - - return zip(current, previousKeypoints).map { (cur, prev) in - // Only smooth if both frames have sufficient confidence - if cur.confidence >= confidenceThreshold && prev.confidence >= confidenceThreshold { - let smoothX = cur.x * (1.0 - smoothingFactor) + prev.x * smoothingFactor - let smoothY = cur.y * (1.0 - smoothingFactor) + prev.y * smoothingFactor - return Keypoint(x: smoothX, y: smoothY, confidence: cur.confidence) - } - return cur - } - } -} - -// MARK: - Skeleton Overlay - -struct SkeletonOverlay: View { - let keypoints: [Keypoint] - let geometrySize: CGSize - let confidenceThreshold: Float - - var body: some View { - Canvas { context, size in - // Draw skeleton connections - for connection in skeletonConnections { - let (startIdx, endIdx) = connection - guard startIdx < keypoints.count, endIdx < keypoints.count else { continue } - - let startKp = keypoints[startIdx] - let endKp = keypoints[endIdx] - - guard startKp.confidence >= confidenceThreshold, - endKp.confidence >= confidenceThreshold else { continue } - - let startPoint = CGPoint( - x: startKp.x * size.width, - y: startKp.y * size.height - ) - let endPoint = CGPoint( - x: endKp.x * size.width, - y: endKp.y * size.height - ) - - var path = Path() - path.move(to: startPoint) - path.addLine(to: endPoint) - - let color = connectionColor(for: connection) - context.stroke(path, with: .color(color), lineWidth: 3.0) - } - - // Draw keypoint dots - for (index, kp) in keypoints.enumerated() { - guard kp.confidence >= confidenceThreshold else { continue } - - let point = CGPoint( - x: kp.x * size.width, - y: kp.y * size.height - ) - - let dotSize: CGFloat = 8.0 - let rect = CGRect( - x: point.x - dotSize / 2, - y: point.y - dotSize / 2, - width: dotSize, - height: dotSize - ) - - let color = keypointColor(for: index) - - // White border - let borderRect = CGRect( - x: point.x - (dotSize + 2) / 2, - y: point.y - (dotSize + 2) / 2, - width: dotSize + 2, - height: dotSize + 2 - ) - context.fill(Path(ellipseIn: borderRect), with: .color(.white)) - context.fill(Path(ellipseIn: rect), with: .color(color)) - } - } - } -} - -// MARK: - FPS Counter View - -struct FPSCounterView: View { - let fps: Double - - var body: some View { - HStack(spacing: 4) { - Circle() - .fill(fps > 20 ? Color.green : (fps > 10 ? 
Color.yellow : Color.red))
- .frame(width: 8, height: 8)
- Text(String(format: "%.1f FPS", fps))
- .font(.system(size: 13, weight: .bold, design: .monospaced))
- .foregroundColor(.white)
- }
- .padding(.horizontal, 10)
- .padding(.vertical, 5)
- .background(Color.black.opacity(0.6))
- .cornerRadius(8)
- }
-}
-
-// MARK: - Keypoint Count Badge
-
-struct KeypointCountBadge: View {
- let count: Int
-
- var body: some View {
- HStack(spacing: 4) {
- Image(systemName: "figure.stand")
- .font(.system(size: 11))
- .foregroundColor(.white)
- Text("\(count)/17 keypoints")
- .font(.system(size: 13, weight: .bold, design: .monospaced))
- .foregroundColor(.white)
- }
- .padding(.horizontal, 10)
- .padding(.vertical, 5)
- .background(Color.black.opacity(0.6))
- .cornerRadius(8)
- }
-}
-
-// MARK: - Content View
-
-struct ContentView: View {
- @StateObject private var camera = CameraManager()
- @StateObject private var estimator = PoseEstimator()
-
- var body: some View {
- ZStack {
- // Camera feed
- CameraPreview(session: camera.session)
- .ignoresSafeArea()
-
- // Skeleton overlay
- GeometryReader { geometry in
- SkeletonOverlay(
- keypoints: estimator.keypoints,
- geometrySize: geometry.size,
- confidenceThreshold: 0.3
- )
- }
- .ignoresSafeArea()
-
- VStack {
- // Top bar: FPS and keypoint count
- HStack {
- FPSCounterView(fps: estimator.fps)
- Spacer()
- KeypointCountBadge(count: estimator.detectedKeypointCount)
- }
- .padding(.horizontal, 16)
- .padding(.top, 8)
-
- Spacer()
-
- // Error message if model not loaded
- if let error = estimator.errorMessage {
- VStack(spacing: 8) {
- Image(systemName: "exclamationmark.triangle.fill")
- .font(.largeTitle)
- .foregroundColor(.yellow)
- Text(error)
- .font(.caption)
- .multilineTextAlignment(.center)
- .padding(.horizontal)
- }
- .padding()
- .background(.ultraThinMaterial, in: RoundedRectangle(cornerRadius: 16))
- .padding()
- }
-
- // Legend at the bottom
- HStack(spacing: 16) {
- LegendItem(color: .blue, label: "Left")
- LegendItem(color: .red, label: "Right")
- LegendItem(color: .green, label: "Center")
- }
- .padding(.horizontal, 16)
- .padding(.vertical, 8)
- .background(Color.black.opacity(0.6))
- .cornerRadius(12)
- .padding(.bottom, 8)
- }
- }
- .onAppear {
- camera.onFrame = { [weak estimator] buffer in
- estimator?.estimatePose(sampleBuffer: buffer)
- }
- camera.configure()
- }
- .onDisappear {
- camera.stop()
- }
- }
-}
-
-// MARK: - Legend Item
-
-struct LegendItem: View {
- let color: Color
- let label: String
-
- var body: some View {
- HStack(spacing: 4) {
- Circle()
- .fill(color)
- .frame(width: 10, height: 10)
- Text(label)
- .font(.system(size: 12, weight: .medium))
- .foregroundColor(.white)
- }
- }
-}
-
-// MARK: - Preview
-
-#Preview {
- ContentView()
-}
diff --git a/sample_apps/DWPoseDemo/DWPoseDemo/DWPoseDemoApp.swift b/sample_apps/DWPoseDemo/DWPoseDemo/DWPoseDemoApp.swift
deleted file mode 100644
index 0cf97fa..0000000
--- a/sample_apps/DWPoseDemo/DWPoseDemo/DWPoseDemoApp.swift
+++ /dev/null
@@ -1,10 +0,0 @@
-import SwiftUI
-
-@main
-struct DWPoseDemoApp: App {
- var body: some Scene {
- WindowGroup {
- ContentView()
- }
- }
-}
diff --git a/sample_apps/DWPoseDemo/DWPoseDemo/Info.plist b/sample_apps/DWPoseDemo/DWPoseDemo/Info.plist
deleted file mode 100644
index ae3c071..0000000
--- a/sample_apps/DWPoseDemo/DWPoseDemo/Info.plist
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-    <key>NSCameraUsageDescription</key>
-    <string>This app needs camera access for real-time pose estimation.</string>
-</dict>
-</plist>
diff --git a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo.xcodeproj/project.pbxproj b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo.xcodeproj/project.pbxproj
deleted file mode 100644
index cb0aa1f..0000000
--- a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo.xcodeproj/project.pbxproj
+++ /dev/null
@@ -1,344 +0,0 @@
-// !$*UTF8*$!
-{
- archiveVersion = 1;
- classes = {
- };
- objectVersion = 56;
- objects = {
-
-/* Begin PBXBuildFile section */
- DA0000010000000000000001 /* DepthAnythingV2DemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = DA0000020000000000000001 /* DepthAnythingV2DemoApp.swift */; };
- DA0000010000000000000002 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = DA0000020000000000000002 /* ContentView.swift */; };
- DA0000010000000000000004 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = DA0000020000000000000004 /* Assets.xcassets */; };
- DAML02 /* DepthAnythingV2Small.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = DAML01 /* DepthAnythingV2Small.mlpackage */; };
-/* End PBXBuildFile section */
-
-/* Begin PBXFileReference section */
- DA0000020000000000000001 /* DepthAnythingV2DemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DepthAnythingV2DemoApp.swift; sourceTree = "<group>"; };
- DA0000020000000000000002 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
- DA0000020000000000000004 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
- DA0000020000000000000005 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
- DA0000020000000000000010 /* DepthAnythingV2Demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DepthAnythingV2Demo.app; sourceTree = BUILT_PRODUCTS_DIR; };
- DAML01 /* DepthAnythingV2Small.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = DepthAnythingV2Small.mlpackage; sourceTree = "<group>"; };
-/* End PBXFileReference section */
-
-/* Begin PBXFrameworksBuildPhase section */
- DA0000030000000000000001 /* Frameworks */ = {
- isa = PBXFrameworksBuildPhase;
- buildActionMask = 2147483647;
- files = (
- );
- runOnlyForDeploymentPostprocessing = 0;
- };
-/* End PBXFrameworksBuildPhase section */
-
-/* Begin PBXGroup section */
- DA0000040000000000000001 = {
- isa = PBXGroup;
- children = (
- DA0000040000000000000002 /* DepthAnythingV2Demo */,
- DA0000040000000000000003 /* Products */,
- );
- sourceTree = "<group>";
- };
- DA0000040000000000000002 /* DepthAnythingV2Demo */ = {
- isa = PBXGroup;
- children = (
- DA0000020000000000000001 /* DepthAnythingV2DemoApp.swift */,
- DA0000020000000000000002 /* ContentView.swift */,
- DA0000020000000000000004 /* Assets.xcassets */,
- DA0000020000000000000005 /* Info.plist */,
- DAML01 /* DepthAnythingV2Small.mlpackage */,
- );
- path = DepthAnythingV2Demo;
- sourceTree = "<group>";
- };
- DA0000040000000000000003 /* Products */ = {
- isa = PBXGroup;
- children = (
- DA0000020000000000000010 /* DepthAnythingV2Demo.app */,
- );
- name = Products;
- sourceTree = "<group>";
- };
-/* End PBXGroup section */
-
-/* Begin PBXNativeTarget section */
- DA0000050000000000000001 /* DepthAnythingV2Demo */ = {
- isa = PBXNativeTarget;
- buildConfigurationList = DA0000070000000000000001 /* Build configuration list for PBXNativeTarget 
"DepthAnythingV2Demo" */; - buildPhases = ( - DA0000060000000000000001 /* Sources */, - DA0000030000000000000001 /* Frameworks */, - DA0000060000000000000002 /* Resources */, - ); - buildRules = ( - ); - dependencies = ( - ); - name = DepthAnythingV2Demo; - productName = DepthAnythingV2Demo; - productReference = DA0000020000000000000010 /* DepthAnythingV2Demo.app */; - productType = "com.apple.product-type.application"; - }; -/* End PBXNativeTarget section */ - -/* Begin PBXProject section */ - DA0000080000000000000001 /* Project object */ = { - isa = PBXProject; - attributes = { - BuildIndependentTargetsInParallel = 1; - LastSwiftUpdateCheck = 1500; - LastUpgradeCheck = 1500; - TargetAttributes = { - DA0000050000000000000001 = { - CreatedOnToolsVersion = 15.0; - }; - }; - }; - buildConfigurationList = DA0000070000000000000003 /* Build configuration list for PBXProject "DepthAnythingV2Demo" */; - compatibilityVersion = "Xcode 14.0"; - developmentRegion = en; - hasScannedForEncodings = 0; - knownRegions = ( - en, - Base, - ); - mainGroup = DA0000040000000000000001; - productRefGroup = DA0000040000000000000003 /* Products */; - projectDirPath = ""; - projectRoot = ""; - targets = ( - DA0000050000000000000001 /* DepthAnythingV2Demo */, - ); - }; -/* End PBXProject section */ - -/* Begin PBXResourcesBuildPhase section */ - DA0000060000000000000002 /* Resources */ = { - isa = PBXResourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - DA0000010000000000000004 /* Assets.xcassets in Resources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXResourcesBuildPhase section */ - -/* Begin PBXSourcesBuildPhase section */ - DA0000060000000000000001 /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - DA0000010000000000000001 /* DepthAnythingV2DemoApp.swift in Sources */, - DA0000010000000000000002 /* ContentView.swift in Sources */, - DAML02 /* DepthAnythingV2Small.mlpackage in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXSourcesBuildPhase section */ - -/* Begin XCBuildConfiguration section */ - DA0000090000000000000001 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; - CLANG_ANALYZER_NONNULL = YES; - CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - CLANG_ENABLE_OBJC_WEAK = YES; - CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; - CLANG_WARN_BOOL_CONVERSION = YES; - CLANG_WARN_COMMA = YES; - CLANG_WARN_CONSTANT_CONVERSION = YES; - CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; - CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; - CLANG_WARN_DOCUMENTATION_COMMENTS = YES; - CLANG_WARN_EMPTY_BODY = YES; - CLANG_WARN_ENUM_CONVERSION = YES; - CLANG_WARN_INFINITE_RECURSION = YES; - CLANG_WARN_INT_CONVERSION = YES; - CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; - CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; - CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; - CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; - CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; - CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; - CLANG_WARN_STRICT_PROTOTYPES = YES; - CLANG_WARN_SUSPICIOUS_MOVE = YES; - CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; - CLANG_WARN_UNREACHABLE_CODE = YES; - CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = dwarf; - ENABLE_STRICT_OBJC_MSGSEND = YES; - 
ENABLE_TESTABILITY = YES; - ENABLE_USER_SCRIPT_SANDBOXING = YES; - GCC_C_LANGUAGE_STANDARD = gnu17; - GCC_DYNAMIC_NO_PIC = NO; - GCC_NO_COMMON_BLOCKS = YES; - GCC_OPTIMIZATION_LEVEL = 0; - GCC_PREPROCESSOR_DEFINITIONS = ( - "DEBUG=1", - "$(inherited)", - ); - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNDECLARED_SELECTOR = YES; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - GCC_WARN_UNUSED_FUNCTION = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; - MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; - MTL_FAST_MATH = YES; - ONLY_ACTIVE_ARCH = YES; - SDKROOT = iphoneos; - SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; - SWIFT_OPTIMIZATION_LEVEL = "-Onone"; - }; - name = Debug; - }; - DA0000090000000000000002 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; - CLANG_ANALYZER_NONNULL = YES; - CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - CLANG_ENABLE_OBJC_WEAK = YES; - CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; - CLANG_WARN_BOOL_CONVERSION = YES; - CLANG_WARN_COMMA = YES; - CLANG_WARN_CONSTANT_CONVERSION = YES; - CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; - CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; - CLANG_WARN_DOCUMENTATION_COMMENTS = YES; - CLANG_WARN_EMPTY_BODY = YES; - CLANG_WARN_ENUM_CONVERSION = YES; - CLANG_WARN_INFINITE_RECURSION = YES; - CLANG_WARN_INT_CONVERSION = YES; - CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; - CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; - CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; - CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; - CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; - CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; - CLANG_WARN_STRICT_PROTOTYPES = YES; - CLANG_WARN_SUSPICIOUS_MOVE = YES; - CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; - CLANG_WARN_UNREACHABLE_CODE = YES; - CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; - ENABLE_NS_ASSERTIONS = NO; - ENABLE_STRICT_OBJC_MSGSEND = YES; - ENABLE_USER_SCRIPT_SANDBOXING = YES; - GCC_C_LANGUAGE_STANDARD = gnu17; - GCC_NO_COMMON_BLOCKS = YES; - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNDECLARED_SELECTOR = YES; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - GCC_WARN_UNUSED_FUNCTION = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; - MTL_ENABLE_DEBUG_INFO = NO; - MTL_FAST_MATH = YES; - SDKROOT = iphoneos; - SWIFT_COMPILATION_MODE = wholemodule; - VALIDATE_PRODUCT = YES; - }; - name = Release; - }; - DA0000090000000000000003 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CODE_SIGN_STYLE = Automatic; - CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = ""; - ENABLE_PREVIEWS = YES; - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_FILE = DepthAnythingV2Demo/Info.plist; - INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time depth estimation."; - INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; - INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; - INFOPLIST_KEY_UILaunchScreen_Generation = YES; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = 
"UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.depthanythingv2"; - PRODUCT_NAME = "$(TARGET_NAME)"; - SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Debug; - }; - DA0000090000000000000004 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CODE_SIGN_STYLE = Automatic; - CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = ""; - ENABLE_PREVIEWS = YES; - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_FILE = DepthAnythingV2Demo/Info.plist; - INFOPLIST_KEY_NSCameraUsageDescription = "This app needs camera access for real-time depth estimation."; - INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; - INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; - INFOPLIST_KEY_UILaunchScreen_Generation = YES; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = "com.coreml-models.depthanythingv2"; - PRODUCT_NAME = "$(TARGET_NAME)"; - SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Release; - }; -/* End XCBuildConfiguration section */ - -/* Begin XCConfigurationList section */ - DA0000070000000000000001 /* Build configuration list for PBXNativeTarget "DepthAnythingV2Demo" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - DA0000090000000000000003 /* Debug */, - DA0000090000000000000004 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; - DA0000070000000000000003 /* Build configuration list for PBXProject "DepthAnythingV2Demo" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - DA0000090000000000000001 /* Debug */, - DA0000090000000000000002 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; -/* End XCConfigurationList section */ - - }; - rootObject = DA0000080000000000000001 /* Project object */; -} diff --git a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json deleted file mode 100644 index eb87897..0000000 --- a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/AccentColor.colorset/Contents.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "colors" : [ - { - "idiom" : "universal" - } - ], - "info" : { - "author" : "xcode", - "version" : 1 - } -} diff --git a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/Contents.json 
b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/Contents.json deleted file mode 100644 index 73c0059..0000000 --- a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Assets.xcassets/Contents.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "info" : { - "author" : "xcode", - "version" : 1 - } -} diff --git a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/ContentView.swift b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/ContentView.swift deleted file mode 100644 index 44745b2..0000000 --- a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/ContentView.swift +++ /dev/null @@ -1,438 +0,0 @@ -import SwiftUI -import UIKit -import AVFoundation -import CoreML -import Vision -import Accelerate - -// MARK: - Camera Manager - -class CameraManager: NSObject, ObservableObject { - let session = AVCaptureSession() - var onFrame: ((CMSampleBuffer) -> Void)? - - private let sessionQueue = DispatchQueue(label: "camera.session") - - func configure() { - sessionQueue.async { [weak self] in - self?.setupSession() - } - } - - private func setupSession() { - session.beginConfiguration() - session.sessionPreset = .high - - guard let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .back), - let input = try? AVCaptureDeviceInput(device: device) else { - session.commitConfiguration() - return - } - - if session.canAddInput(input) { - session.addInput(input) - } - - let output = AVCaptureVideoDataOutput() - output.setSampleBufferDelegate(self, queue: DispatchQueue(label: "camera.frame")) - output.alwaysDiscardsLateVideoFrames = true - - if session.canAddOutput(output) { - session.addOutput(output) - } - - session.commitConfiguration() - session.startRunning() - } - - func stop() { - sessionQueue.async { [weak self] in - self?.session.stopRunning() - } - } -} - -extension CameraManager: AVCaptureVideoDataOutputSampleBufferDelegate { - func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { - onFrame?(sampleBuffer) - } -} - -// MARK: - Camera Preview - -struct CameraPreview: UIViewRepresentable { - let session: AVCaptureSession - - func makeUIView(context: Context) -> UIView { - let view = UIView(frame: .zero) - let previewLayer = AVCaptureVideoPreviewLayer(session: session) - previewLayer.videoGravity = .resizeAspectFill - view.layer.addSublayer(previewLayer) - context.coordinator.previewLayer = previewLayer - return view - } - - func updateUIView(_ uiView: UIView, context: Context) { - context.coordinator.previewLayer?.frame = uiView.bounds - } - - func makeCoordinator() -> Coordinator { - Coordinator() - } - - class Coordinator { - var previewLayer: AVCaptureVideoPreviewLayer? - } -} - -// MARK: - Depth Estimator - -class DepthEstimator: ObservableObject { - @Published var depthImage: UIImage? - @Published var errorMessage: String? - @Published var minDepth: Float = 0 - @Published var maxDepth: Float = 0 - - private var vnModel: VNCoreMLModel? - private var isProcessing = false - - /// Width and height of the model output depth map. - private let depthSize = 518 - - init() { - loadModel() - } - - private func loadModel() { - // PLACEHOLDER: Add DepthAnythingV2Small.mlpackage to the Xcode project. - // The compiled .mlmodelc will be bundled automatically. - // Download from the CoreML-Models repository and drag into Xcode. 
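- // The Vision wrapper below (VNCoreMLModel + VNCoreMLRequest) takes care of scaling
- // camera frames to the model's 518x518 input; request.imageCropAndScaleOption picks
- // the fitting strategy (.scaleFill stretches, .centerCrop crops) before inference.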
- - guard let modelURL = Bundle.main.url(forResource: "DepthAnythingV2Small", withExtension: "mlmodelc") else { - DispatchQueue.main.async { - self.errorMessage = "Model not found. Please add DepthAnythingV2Small.mlpackage to the Xcode project." - } - return - } - - do { - let config = MLModelConfiguration() - config.computeUnits = .all - let mlModel = try MLModel(contentsOf: modelURL, configuration: config) - vnModel = try VNCoreMLModel(for: mlModel) - } catch { - DispatchQueue.main.async { - self.errorMessage = "Failed to load model: \(error.localizedDescription)" - } - } - } - - func estimateDepth(sampleBuffer: CMSampleBuffer) { - guard !isProcessing, let vnModel = vnModel else { return } - guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } - isProcessing = true - - let request = VNCoreMLRequest(model: vnModel) { [weak self] request, error in - defer { self?.isProcessing = false } - - guard let self = self else { return } - - if let results = request.results as? [VNCoreMLFeatureValueObservation], - let multiArray = results.first?.featureValue.multiArrayValue { - self.processDepthOutput(multiArray: multiArray) - } - } - request.imageCropAndScaleOption = .scaleFill - - let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .right, options: [:]) - try? handler.perform([request]) - } - - private func processDepthOutput(multiArray: MLMultiArray) { - let count = multiArray.count - let size = depthSize - - // Extract raw depth values - var depths = [Float](repeating: 0, count: count) - let ptr = multiArray.dataPointer.bindMemory(to: Float.self, capacity: count) - for i in 0.. 0 else { - return - } - - // Create RGBA pixel data with a color gradient - var pixelData = [UInt8](repeating: 255, count: size * size * 4) - - for i in 0.. (UInt8, UInt8, UInt8) { - // Turbo-inspired colormap: near = warm (red), far = cool (blue) - let t = max(0, min(1, value)) - - let r: Float - let g: Float - let b: Float - - if t < 0.25 { - // Red -> Yellow - let s = t / 0.25 - r = 1.0 - g = s - b = 0.0 - } else if t < 0.5 { - // Yellow -> Green - let s = (t - 0.25) / 0.25 - r = 1.0 - s - g = 1.0 - b = 0.0 - } else if t < 0.75 { - // Green -> Cyan - let s = (t - 0.5) / 0.25 - r = 0.0 - g = 1.0 - b = s - } else { - // Cyan -> Blue - let s = (t - 0.75) / 0.25 - r = 0.0 - g = 1.0 - s - b = 1.0 - } - - return ( - UInt8(r * 255), - UInt8(g * 255), - UInt8(b * 255) - ) - } - - /// Creates a fully opaque version of the depth map for full-screen display. - func opaqueDepthImage() -> UIImage? 
{ - guard let cgImage = depthImage?.cgImage else { return nil } - let width = cgImage.width - let height = cgImage.height - - guard let context = CGContext( - data: nil, - width: width, - height: height, - bitsPerComponent: 8, - bytesPerRow: width * 4, - space: CGColorSpaceCreateDeviceRGB(), - bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue - ) else { return nil } - - // Draw black background then the image on top - context.setFillColor(UIColor.black.cgColor) - context.fill(CGRect(x: 0, y: 0, width: width, height: height)) - context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height)) - - guard let result = context.makeImage() else { return nil } - return UIImage(cgImage: result) - } -} - -// MARK: - Content View - -struct ContentView: View { - @StateObject private var camera = CameraManager() - @StateObject private var depthEstimator = DepthEstimator() - @State private var showFullDepthMap = false - - var body: some View { - ZStack { - // Camera feed (hidden when full depth map is shown) - if !showFullDepthMap { - CameraPreview(session: camera.session) - .ignoresSafeArea() - } else { - Color.black - .ignoresSafeArea() - } - - // Depth map overlay or full-screen depth map - if let depthImg = depthEstimator.depthImage { - if showFullDepthMap { - if let opaqueImg = depthEstimator.opaqueDepthImage() { - Image(uiImage: opaqueImg) - .resizable() - .aspectRatio(contentMode: .fill) - .ignoresSafeArea() - } - } else { - Image(uiImage: depthImg) - .resizable() - .aspectRatio(contentMode: .fill) - .ignoresSafeArea() - } - } - - // UI controls - VStack { - // Top bar with title and toggle - HStack { - Text("Depth Anything V2") - .font(.headline) - .foregroundColor(.white) - - Spacer() - - Button(action: { - withAnimation(.easeInOut(duration: 0.3)) { - showFullDepthMap.toggle() - } - }) { - HStack(spacing: 4) { - Image(systemName: showFullDepthMap ? "camera.fill" : "square.stack.3d.up.fill") - .font(.body) - Text(showFullDepthMap ? 
"Camera" : "Depth") - .font(.caption) - } - .padding(.horizontal, 12) - .padding(.vertical, 6) - .background(.ultraThinMaterial, in: Capsule()) - .foregroundColor(.white) - } - } - .padding(.horizontal) - .padding(.top, 8) - - Spacer() - - // Error message if model not loaded - if let error = depthEstimator.errorMessage { - VStack(spacing: 8) { - Image(systemName: "exclamationmark.triangle.fill") - .font(.largeTitle) - .foregroundColor(.yellow) - Text(error) - .font(.caption) - .multilineTextAlignment(.center) - .padding(.horizontal) - } - .padding() - .background(.ultraThinMaterial, in: RoundedRectangle(cornerRadius: 16)) - .padding() - } - - // Depth info overlay - if depthEstimator.depthImage != nil { - VStack(spacing: 8) { - // Color legend - HStack(spacing: 0) { - Text("Near") - .font(.caption2) - .foregroundColor(.white) - Spacer() - - // Gradient bar - LinearGradient( - gradient: Gradient(colors: [.red, .yellow, .green, .cyan, .blue]), - startPoint: .leading, - endPoint: .trailing - ) - .frame(height: 8) - .cornerRadius(4) - .padding(.horizontal, 8) - - Spacer() - Text("Far") - .font(.caption2) - .foregroundColor(.white) - } - - // Depth statistics - HStack { - Label { - Text(String(format: "Min: %.2f", depthEstimator.minDepth)) - .font(.system(.caption2, design: .monospaced)) - } icon: { - Image(systemName: "arrow.down.circle.fill") - .foregroundColor(.red) - .font(.caption2) - } - - Spacer() - - Label { - Text(String(format: "Max: %.2f", depthEstimator.maxDepth)) - .font(.system(.caption2, design: .monospaced)) - } icon: { - Image(systemName: "arrow.up.circle.fill") - .foregroundColor(.blue) - .font(.caption2) - } - } - .foregroundColor(.white) - } - .padding() - .background(.black.opacity(0.7), in: RoundedRectangle(cornerRadius: 16)) - .padding() - } - } - } - .onAppear { - camera.onFrame = { [weak depthEstimator] buffer in - depthEstimator?.estimateDepth(sampleBuffer: buffer) - } - camera.configure() - } - .onDisappear { - camera.stop() - } - } -} - -#Preview { - ContentView() -} diff --git a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/DepthAnythingV2DemoApp.swift b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/DepthAnythingV2DemoApp.swift deleted file mode 100644 index c245af6..0000000 --- a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/DepthAnythingV2DemoApp.swift +++ /dev/null @@ -1,10 +0,0 @@ -import SwiftUI - -@main -struct DepthAnythingV2DemoApp: App { - var body: some Scene { - WindowGroup { - ContentView() - } - } -} diff --git a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Info.plist b/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Info.plist deleted file mode 100644 index 7fac6e7..0000000 --- a/sample_apps/DepthAnythingV2Demo/DepthAnythingV2Demo/Info.plist +++ /dev/null @@ -1,8 +0,0 @@ - - - - - NSCameraUsageDescription - This app needs camera access for real-time depth estimation. 
-</dict>
-</plist>
diff --git a/creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/sample_apps/YOLOv10Demo/YOLOv10Demo/Assets.xcassets/AppIcon.appiconset/Contents.json
similarity index 100%
rename from creative_apps/WhisperDemo/WhisperDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
rename to sample_apps/YOLOv10Demo/YOLOv10Demo/Assets.xcassets/AppIcon.appiconset/Contents.json
diff --git a/sample_apps/YOLOv10Demo/YOLOv10Demo/ContentView.swift b/sample_apps/YOLOv10Demo/YOLOv10Demo/ContentView.swift
index f16a8c4..2787d22 100644
--- a/sample_apps/YOLOv10Demo/YOLOv10Demo/ContentView.swift
+++ b/sample_apps/YOLOv10Demo/YOLOv10Demo/ContentView.swift
@@ -200,8 +200,9 @@ class ObjectDetector: ObservableObject {
             defer { self?.isProcessing = false }
             guard let self = self else { return }
 
-            if let results = request.results as? [VNRecognizedObjectObservation] {
-                self.processRecognizedObjects(results)
+            if let results = request.results as? [VNCoreMLFeatureValueObservation],
+               let multiArray = results.first?.featureValue.multiArrayValue {
+                self.processRawOutput(multiArray)
             } else {
                 DispatchQueue.main.async {
                     self.detections = []
@@ -214,21 +215,36 @@ class ObjectDetector: ObservableObject {
         try? handler.perform([request])
     }
 
-    private func processRecognizedObjects(_ observations: [VNRecognizedObjectObservation]) {
-        let filtered = observations.filter { $0.confidence >= confidenceThreshold }
+    /// Parse raw YOLOv10 output [1, 300, 6] where each row is [x1, y1, x2, y2, confidence, class_id].
+    private func processRawOutput(_ multiArray: MLMultiArray) {
+        let numDetections = multiArray.shape[1].intValue // 300
+        let stride = multiArray.shape[2].intValue // 6
+        let ptr = multiArray.dataPointer.bindMemory(to: Float.self, capacity: numDetections * stride)
 
-        let results: [Detection] = filtered.compactMap { observation in
-            guard let topLabel = observation.labels.first else { return nil }
+        var results: [Detection] = []
 
-            // Attempt to find COCO class index from label identifier
-            let classIndex = cocoClassLabels.first(where: { $0.value == topLabel.identifier })?.key ?? 0
+        for i in 0..<numDetections {
+            let base = i * stride
+            let confidence = ptr[base + 4]
+            guard confidence >= confidenceThreshold else { continue }
 
-            return Detection(
-                classIndex: classIndex,
-                label: topLabel.identifier,
-                confidence: topLabel.confidence,
-                boundingBox: observation.boundingBox
-            )
+            let x1 = CGFloat(ptr[base]) / 640.0
+            let y1 = CGFloat(ptr[base + 1]) / 640.0
+            let x2 = CGFloat(ptr[base + 2]) / 640.0
+            let y2 = CGFloat(ptr[base + 3]) / 640.0
+            let classId = Int(ptr[base + 5])
+
+            let label = cocoClassLabels[classId] ??
"class_\(classId)" + + // Convert from top-left origin to Vision convention (bottom-left origin) + let box = CGRect(x: x1, y: 1 - y2, width: x2 - x1, height: y2 - y1) + + results.append(Detection( + classIndex: classId, + label: label, + confidence: confidence, + boundingBox: box + )) } DispatchQueue.main.async { From cf93a3f65b56eef9fee249f1bba87e950f99c01e Mon Sep 17 00:00:00 2001 From: john-rocky Date: Wed, 1 Apr 2026 17:46:09 +0900 Subject: [PATCH 15/18] Implement HTDemucs source separation pipeline with STFT/iSTFT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Accelerate-based STFT/iSTFT signal processing (vDSP FFT) - Implement stride-aware MLMultiArray extraction for Float16 outputs - Use time-domain output only (freq branch overflows Float16 → ±inf) - Audio loading with format conversion and resampling to 44.1kHz stereo - Per-stem WAV export and playback - Fix SwiftUI type-checker timeout in WaveformView Known issue: freq_output produces ±inf due to Float16 overflow in the model's frequency branch. Reconverting the model with Float32 outputs should enable freq+time reconstruction for better separation quality. --- .../DemucsDemo.xcodeproj/project.pbxproj | 37 +- .../DemucsDemo/DemucsDemo/ContentView.swift | 534 +++++++++++++++--- 2 files changed, 483 insertions(+), 88 deletions(-) diff --git a/creative_apps/DemucsDemo/DemucsDemo.xcodeproj/project.pbxproj b/creative_apps/DemucsDemo/DemucsDemo.xcodeproj/project.pbxproj index 3761e41..7f7cacc 100644 --- a/creative_apps/DemucsDemo/DemucsDemo.xcodeproj/project.pbxproj +++ b/creative_apps/DemucsDemo/DemucsDemo.xcodeproj/project.pbxproj @@ -7,17 +7,17 @@ objects = { /* Begin PBXBuildFile section */ - B10001 /* DemucsDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = B10002; }; - B10003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B10004; }; - B10005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = B10006; }; - B1DM02 /* HTDemucs_SourceSeparation.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = B1DM01; }; + B10001 /* DemucsDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = B10002 /* DemucsDemoApp.swift */; }; + B10003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B10004 /* ContentView.swift */; }; + B10005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = B10006 /* Assets.xcassets */; }; + B1DM02 /* HTDemucs_SourceSeparation.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = B1DM01 /* HTDemucs_SourceSeparation.mlpackage */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ - B10007 /* DemucsDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DemucsDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; B10002 /* DemucsDemoApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DemucsDemoApp.swift; sourceTree = ""; }; B10004 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; B10006 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + B10007 /* DemucsDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DemucsDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; B10008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = 
""; }; B1DM01 /* HTDemucs_SourceSeparation.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = HTDemucs_SourceSeparation.mlpackage; sourceTree = ""; }; /* End PBXFileReference section */ @@ -66,7 +66,7 @@ /* Begin PBXNativeTarget section */ B10013 /* DemucsDemo */ = { isa = PBXNativeTarget; - buildConfigurationList = B10014; + buildConfigurationList = B10014 /* Build configuration list for PBXNativeTarget "DemucsDemo" */; buildPhases = ( B10015 /* Sources */, B10009 /* Frameworks */, @@ -78,7 +78,7 @@ ); name = DemucsDemo; productName = DemucsDemo; - productReference = B10007; + productReference = B10007 /* DemucsDemo.app */; productType = "com.apple.product-type.application"; }; /* End PBXNativeTarget section */ @@ -96,7 +96,7 @@ }; }; }; - buildConfigurationList = B10018; + buildConfigurationList = B10018 /* Build configuration list for PBXProject "DemucsDemo" */; compatibilityVersion = "Xcode 14.0"; developmentRegion = en; hasScannedForEncodings = 0; @@ -105,11 +105,11 @@ Base, ); mainGroup = B10010; - productRefGroup = B10012; + productRefGroup = B10012 /* Products */; projectDirPath = ""; projectRoot = ""; targets = ( - B10013, + B10013 /* DemucsDemo */, ); }; /* End PBXProject section */ @@ -199,6 +199,7 @@ ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = MFN25KNUGJ; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = DemucsDemo/Info.plist; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; @@ -226,6 +227,7 @@ ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = MFN25KNUGJ; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = DemucsDemo/Info.plist; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; @@ -249,26 +251,25 @@ /* End XCBuildConfiguration section */ /* Begin XCConfigurationList section */ - B10018 /* Build configuration list for PBXProject */ = { + B10014 /* Build configuration list for PBXNativeTarget "DemucsDemo" */ = { isa = XCConfigurationList; buildConfigurations = ( - B10019, - B10020, + B10021 /* Debug */, + B10022 /* Release */, ); defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; - B10014 /* Build configuration list for PBXNativeTarget */ = { + B10018 /* Build configuration list for PBXProject "DemucsDemo" */ = { isa = XCConfigurationList; buildConfigurations = ( - B10021, - B10022, + B10019 /* Debug */, + B10020 /* Release */, ); defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; /* End XCConfigurationList section */ - }; - rootObject = B10017; + rootObject = B10017 /* Project object */; } diff --git a/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift b/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift index 7d11bf6..f0dad00 100644 --- a/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift +++ b/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift @@ -3,28 +3,28 @@ import UIKit import CoreML import AVFoundation import UniformTypeIdentifiers +import Accelerate // MARK: - HTDemucs Audio Source Separation Demo -// -// HTDemucs separates audio into 4 stems: Vocals, Drums, Bass, Other. -// -// IMPORTANT: The model operates in the frequency domain. -// In a production app, you must perform STFT (Short-Time Fourier Transform) on the input -// audio to produce the freq_input (1,8,2049,336) tensor, and also provide the raw -// time_input (1,2,343980) waveform. 
After inference, the frequency and time domain -// outputs must be combined via iSTFT (Inverse STFT) to reconstruct each stem's waveform. -// -// This demo uses simplified/placeholder audio processing to demonstrate the UI flow. -// A full implementation would require an STFT library (e.g., Accelerate vDSP). enum Stem: String, CaseIterable, Identifiable { - case vocals = "Vocals" case drums = "Drums" case bass = "Bass" + case vocals = "Vocals" case other = "Other" var id: String { rawValue } + // Index in model output (freq_output: 4 channels per stem, time_output: 2 channels per stem) + var modelIndex: Int { + switch self { + case .drums: return 0 + case .bass: return 1 + case .vocals: return 2 + case .other: return 3 + } + } + var icon: String { switch self { case .vocals: return "mic.fill" @@ -164,7 +164,7 @@ struct ContentView: View { Spacer() - // Waveform visualization placeholder + // Waveform visualization if viewModel.isSeparated { WaveformView(activeStem: viewModel.playingStem) .frame(height: 80) @@ -206,7 +206,6 @@ struct StemPlayerView: View { Spacer() - // Volume indicator HStack(spacing: 2) { ForEach(0..<5) { i in RoundedRectangle(cornerRadius: 1) @@ -216,11 +215,7 @@ struct StemPlayerView: View { } Button(action: { - if isPlaying { - onStop() - } else { - onPlay() - } + if isPlaying { onStop() } else { onPlay() } }) { Image(systemName: isPlaying ? "stop.circle.fill" : "play.circle.fill") .font(.title) @@ -243,24 +238,33 @@ struct WaveformView: View { var body: some View { TimelineView(.animation) { timeline in - Canvas { context, size in - let color = activeStem?.color ?? .gray - let midY = size.height / 2 - let amplitude = activeStem != nil ? size.height * 0.35 : size.height * 0.1 - let time = timeline.date.timeIntervalSinceReferenceDate - - var path = Path() - path.move(to: CGPoint(x: 0, y: midY)) - for x in stride(from: 0, through: size.width, by: 2) { - let normalizedX = x / size.width - let y = midY + sin(normalizedX * .pi * 6 + time * 3) * amplitude * - (0.5 + 0.5 * sin(normalizedX * .pi * 2 + time * 1.5)) - path.addLine(to: CGPoint(x: x, y: y)) - } + waveformCanvas(time: timeline.date.timeIntervalSinceReferenceDate) + } + } - context.stroke(path, with: .color(color.opacity(0.7)), lineWidth: 2) - } + private func waveformCanvas(time: Double) -> some View { + let color: Color = activeStem?.color ?? .gray + let isActive: Bool = activeStem != nil + return Canvas { context, size in + drawWaveform(context: context, size: size, time: time, color: color, isActive: isActive) + } + } + + private func drawWaveform(context: GraphicsContext, size: CGSize, time: Double, color: Color, isActive: Bool) { + let midY: CGFloat = size.height / 2 + let amplitude: CGFloat = isActive ? 
size.height * 0.35 : size.height * 0.1 + + var path = Path() + path.move(to: CGPoint(x: 0, y: midY)) + for x in stride(from: 0, through: size.width, by: 2) { + let normalizedX: CGFloat = x / size.width + let wave1: CGFloat = sin(normalizedX * .pi * 6 + time * 3) + let wave2: CGFloat = sin(normalizedX * .pi * 2 + time * 1.5) + let y: CGFloat = midY + wave1 * amplitude * (0.5 + 0.5 * wave2) + path.addLine(to: CGPoint(x: x, y: y)) } + + context.stroke(path, with: .color(color.opacity(0.7)), lineWidth: 2) } } @@ -302,6 +306,384 @@ struct AudioFilePickerView: UIViewControllerRepresentable { } } +// MARK: - STFT / iSTFT Signal Processing + +private enum DSP { + static let fftSize = 4096 + static let hopSize = 1024 + static let numBins = 2048 // fftSize / 2 + static let numFrames = 336 + static let segmentLength = 343980 + static let segmentOffset = 0 // Set > 0 to skip intro (e.g., 343980 = skip first ~7.8s) + static let sampleRate: Double = 44100 + + // Periodic Hann window (matches PyTorch's hann_window with periodic=True) + static let window: [Float] = (0.. [Float] { + let n = signal.count + var padded = [Float](repeating: 0, count: n + 2 * padSize) + // Left: signal[padSize], signal[padSize-1], ..., signal[1] + for i in 0...size) + } + } + // Right: signal[n-2], signal[n-3], ..., signal[n-1-padSize] + for i in 0.. (left: [Float], right: [Float]) { + _ = url.startAccessingSecurityScopedResource() + defer { url.stopAccessingSecurityScopedResource() } + + let sourceFile = try AVAudioFile(forReading: url) + let targetFormat = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: sampleRate, + channels: 2, + interleaved: false + )! + + let totalNeeded = segmentOffset + segmentLength + guard let outputBuffer = AVAudioPCMBuffer(pcmFormat: targetFormat, frameCapacity: AVAudioFrameCount(totalNeeded)) else { + throw DemucsError.processingFailed("Failed to create audio buffer") + } + + if sourceFile.processingFormat.sampleRate == sampleRate && sourceFile.processingFormat.channelCount == 2 { + let count = min(AVAudioFrameCount(sourceFile.length), AVAudioFrameCount(totalNeeded)) + try sourceFile.read(into: outputBuffer, frameCount: count) + } else { + guard let converter = AVAudioConverter(from: sourceFile.processingFormat, to: targetFormat) else { + throw DemucsError.processingFailed("Cannot convert audio format") + } + let srcBuffer = AVAudioPCMBuffer(pcmFormat: sourceFile.processingFormat, frameCapacity: AVAudioFrameCount(sourceFile.length))! + try sourceFile.read(into: srcBuffer) + var error: NSError? + converter.convert(to: outputBuffer, error: &error) { _, outStatus in + outStatus.pointee = .haveData + return srcBuffer + } + if let error { throw error } + } + + let count = Int(outputBuffer.frameLength) + let leftPtr = outputBuffer.floatChannelData![0] + let rightCh = outputBuffer.format.channelCount > 1 ? 
1 : 0 + let rightPtr = outputBuffer.floatChannelData![rightCh] + + // Skip segmentOffset samples, take segmentLength samples + let offset = min(segmentOffset, max(0, count - segmentLength)) + let available = min(segmentLength, count - offset) + var left = Array(UnsafeBufferPointer(start: leftPtr + offset, count: available)) + var right = Array(UnsafeBufferPointer(start: rightPtr + offset, count: available)) + + // Pad or trim to segment length + if left.count < segmentLength { + left.append(contentsOf: [Float](repeating: 0, count: segmentLength - left.count)) + right.append(contentsOf: [Float](repeating: 0, count: segmentLength - right.count)) + } else if left.count > segmentLength { + left = Array(left.prefix(segmentLength)) + right = Array(right.prefix(segmentLength)) + } + + return (left, right) + } + + /// Forward STFT using vDSP. + /// Returns (real, imag) arrays of size [numBins * numFrames], bin-major order. + /// Values are true (unscaled) DFT coefficients matching PyTorch's torch.stft output. + static func forwardSTFT(signal: [Float]) -> (real: [Float], imag: [Float]) { + let log2n = vDSP_Length(log2(Float(fftSize))) + guard let fftSetup = vDSP_create_fftsetup(log2n, FFTRadix(kFFTRadix2)) else { return ([], []) } + defer { vDSP_destroy_fftsetup(fftSetup) } + + let halfN = fftSize / 2 + var allReal = [Float](repeating: 0, count: numBins * numFrames) + var allImag = [Float](repeating: 0, count: numBins * numFrames) + var frame = [Float](repeating: 0, count: fftSize) + var rp = [Float](repeating: 0, count: halfN) + var ip = [Float](repeating: 0, count: halfN) + + for f in 0.. 0 { + signal.withUnsafeBufferPointer { buf in + frame.withUnsafeMutableBufferPointer { dst in + memcpy(dst.baseAddress!, buf.baseAddress! + start, avail * MemoryLayout.size) + } + } + } + + // Apply analysis window + vDSP_vmul(frame, 1, window, 1, &frame, 1, vDSP_Length(fftSize)) + + // Pack as split complex: rp[i] = frame[2i], ip[i] = frame[2i+1] + frame.withUnsafeBufferPointer { src in + src.baseAddress!.withMemoryRebound(to: DSPComplex.self, capacity: halfN) { complexPtr in + rp.withUnsafeMutableBufferPointer { rpBuf in + ip.withUnsafeMutableBufferPointer { ipBuf in + var sc = DSPSplitComplex(realp: rpBuf.baseAddress!, imagp: ipBuf.baseAddress!) + vDSP_ctoz(complexPtr, 2, &sc, 1, vDSP_Length(halfN)) + } + } + } + } + + // Forward FFT (output is 2x true DFT) + rp.withUnsafeMutableBufferPointer { rpBuf in + ip.withUnsafeMutableBufferPointer { ipBuf in + var sc = DSPSplitComplex(realp: rpBuf.baseAddress!, imagp: ipBuf.baseAddress!) + vDSP_fft_zrip(fftSetup, &sc, 1, log2n, FFTDirection(kFFTDirection_Forward)) + } + } + + // Store true DFT values (divide by 2) + // Bin 0 (DC): rp[0]/2, imag = 0 + allReal[f] = rp[0] * 0.5 + allImag[f] = 0 + + // Bins 1..numBins-1 + for k in 1.. [Float] { + let log2n = vDSP_Length(log2(Float(fftSize))) + guard let fftSetup = vDSP_create_fftsetup(log2n, FFTRadix(kFFTRadix2)) else { return [] } + defer { vDSP_destroy_fftsetup(fftSetup) } + + let halfN = fftSize / 2 + var output = [Float](repeating: 0, count: outputLength) + var windowSum = [Float](repeating: 0, count: outputLength) + var rp = [Float](repeating: 0, count: halfN) + var ip = [Float](repeating: 0, count: halfN) + var frame = [Float](repeating: 0, count: fftSize) + + for f in 0.. 
1e-8 { + output[i] /= windowSum[i] + } + } + + return output + } + + /// Convert MLMultiArray to [Float], handling Float16/Float32 output types + static func mlArrayToFloat32(_ array: MLMultiArray) -> [Float] { + let count = array.count + switch array.dataType { + case .float32: + let ptr = array.dataPointer.bindMemory(to: Float.self, capacity: count) + return Array(UnsafeBufferPointer(start: ptr, count: count)) + case .float16: + var result = [Float](repeating: 0, count: count) + let srcPtr = array.dataPointer + result.withUnsafeMutableBufferPointer { dst in + var srcBuf = vImage_Buffer( + data: UnsafeMutableRawPointer(mutating: srcPtr), + height: 1, + width: vImagePixelCount(count), + rowBytes: count * 2 + ) + var dstBuf = vImage_Buffer( + data: dst.baseAddress!, + height: 1, + width: vImagePixelCount(count), + rowBytes: count * MemoryLayout.size + ) + vImageConvert_Planar16FtoPlanarF(&srcBuf, &dstBuf, 0) + } + return result + default: + return (0.. [Float] { + let strides = array.strides.map { $0.intValue } + let baseOffset = batch * strides[0] + channel * strides[1] + let hStride = strides[2] + let wStride = strides[3] + let count = height * width + var result = [Float](repeating: 0, count: count) + + if array.dataType == .float32 { + let ptr = array.dataPointer.bindMemory(to: Float.self, capacity: array.count) + if wStride == 1 { + // Row-contiguous: copy row by row (handles padding between rows) + result.withUnsafeMutableBufferPointer { dst in + for h in 0.. [Float] { + let strides = array.strides.map { $0.intValue } + let baseOffset = batch * strides[0] + channel * strides[1] + let wStride = strides[2] + var result = [Float](repeating: 0, count: width) + + if array.dataType == .float32 { + let ptr = array.dataPointer.bindMemory(to: Float.self, capacity: array.count) + if wStride == 1 { + result.withUnsafeMutableBufferPointer { dst in + memcpy(dst.baseAddress!, ptr + baseOffset, width * MemoryLayout.size) + } + } else { + for w in 0.. Date: Wed, 1 Apr 2026 18:07:25 +0900 Subject: [PATCH 16/18] Add HTDemucs CoreML conversion script (Float32) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Draft conversion script to reconvert HTDemucs with Float32 precision. The current Float16 model causes overflow (±inf) in the frequency branch. --- conversion_scripts/convert_htdemucs.py | 156 +++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 conversion_scripts/convert_htdemucs.py diff --git a/conversion_scripts/convert_htdemucs.py b/conversion_scripts/convert_htdemucs.py new file mode 100644 index 0000000..6106feb --- /dev/null +++ b/conversion_scripts/convert_htdemucs.py @@ -0,0 +1,156 @@ +# HTDemucs -> CoreML conversion +# pip install torch torchaudio coremltools demucs +# +# Converts the HTDemucs source separation model to CoreML. +# The model separates audio into 4 stems: drums, bass, vocals, other. +# +# Architecture: Hybrid Transformer Demucs +# - Frequency branch: processes STFT (complex as real/imag channels) +# - Time branch: processes raw waveform +# Both branches produce output that should be combined for best quality. +# +# NOTE: Use compute_precision=ct.precision.FLOAT32 to avoid Float16 overflow +# in the frequency branch (intermediate values exceed ±65504). 
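As an aside on the numbers here: Float16's largest finite value is 65504, so any intermediate or output magnitude beyond that saturates to ±inf, which is exactly what the app reports for freq_output. A quick way to confirm the overflow from the Swift side is to scan the output tensor for non-finite values; a minimal sketch, assuming the tensor has Float32 storage (the helper name is illustrative):

    import CoreML

    // Count ±inf / NaN entries in a Float32 MLMultiArray output.
    func nonFiniteCount(in array: MLMultiArray) -> Int {
        guard array.dataType == .float32 else { return 0 }
        let ptr = array.dataPointer.bindMemory(to: Float.self, capacity: array.count)
        var bad = 0
        for i in 0..<array.count where !ptr[i].isFinite {
            bad += 1
        }
        return bad
    }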
+ +import torch +import coremltools as ct + +# Load pretrained HTDemucs +from demucs.pretrained import get_model + +model = get_model("htdemucs") +model.eval() + +# Model parameters +segment_samples = int(model.segment * model.samplerate) # ~343980 samples at 44.1kHz +n_fft = model.nfft # 4096 +hop_length = n_fft // 4 # 1024 +n_freq = n_fft // 2 # 2048 +n_frames = segment_samples // hop_length # 336 + +print(f"segment_samples={segment_samples}, n_fft={n_fft}, hop={hop_length}, n_freq={n_freq}, n_frames={n_frames}") + +# Create dummy inputs matching the model's expected format +# spectral_magnitude: complex STFT as real/imag channels [1, 4, n_freq, n_frames] +# torch.cat([z.real, z.imag], dim=1) where z is [batch, 2, n_freq, n_frames] complex +dummy_spectral = torch.randn(1, 4, n_freq, n_frames) +# audio_waveform: raw stereo waveform [1, 2, segment_samples] +dummy_waveform = torch.randn(1, 2, segment_samples) + +# Trace the model's core forward pass +# HTDemucs forward takes both spectral and waveform inputs +class HTDemucsWrapper(torch.nn.Module): + """Wrapper to expose the hybrid encoder-decoder as a single forward pass.""" + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, spectral_magnitude, audio_waveform): + # The model internally splits spectral into real/imag and processes both branches + # Reconstruct the complex STFT from real/imag channels + B = spectral_magnitude.shape[0] + # spectral_magnitude: [B, 4, F, T] -> [B, 2, F, T] complex + half = spectral_magnitude.shape[1] // 2 # 2 + z_real = spectral_magnitude[:, :half] # [B, 2, F, T] + z_imag = spectral_magnitude[:, half:] # [B, 2, F, T] + z = torch.complex(z_real, z_imag) # [B, 2, F, T] complex + + # Run the model's internal processing + # The model has: frequency encoder, time encoder, cross-attention, decoders + length = audio_waveform.shape[-1] + + # Use model's internal _spec and _ispec methods if available, + # otherwise use the full forward pass + x = audio_waveform + + # Encode frequency branch + freq_encoded = self.model._domain_forward("freq", z) + # Encode time branch + time_encoded = self.model._domain_forward("time", x) + + # Cross-attention and decoder + freq_out, time_out = self.model._decode(freq_encoded, time_encoded, length) + + # freq_out: [B, S*2C, F, T] where S=4 sources, C=2 stereo + # Convert complex output to real/imag channels + # freq_out is complex [B, S, 2, F, T] + freq_real = freq_out.real # [B, sources*2, F, T] + freq_imag = freq_out.imag + freq_output = torch.cat([freq_real, freq_imag], dim=2) # or appropriate dim + + return freq_out, time_out + +# Alternative simpler approach: trace the full model +class HTDemucsSimpleWrapper(torch.nn.Module): + """Simple wrapper that takes spectral + waveform and returns separated stems.""" + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, spectral_magnitude, audio_waveform): + # Reconstruct complex STFT + half = spectral_magnitude.shape[1] // 2 + z_real = spectral_magnitude[:, :half] + z_imag = spectral_magnitude[:, half:] + z = torch.complex(z_real, z_imag) + + length = audio_waveform.shape[-1] + + # Use the model's hybrid forward + # freq_out: complex [B, S, C, F, T], time_out: [B, S, C, T] + freq_out, time_out = self.model.forward_hybrid(z, audio_waveform, length) + + # Convert freq_out complex to real channels + # freq_out shape: [B, 4, 2, n_freq, n_frames] complex + # -> [B, 16, n_freq, n_frames] as real/imag interleaved per source + S = freq_out.shape[1] # 4 sources + C = 
freq_out.shape[2] # 2 channels + freq_real = freq_out.real.reshape(1, S * C, n_freq, n_frames) + freq_imag = freq_out.imag.reshape(1, S * C, n_freq, n_frames) + freq_result = torch.cat([freq_real, freq_imag], dim=1) # [1, 16, F, T] + + # time_out shape: [B, 4, 2, T] -> [1, 8, T] + time_result = time_out.reshape(1, S * C, -1) + + return freq_result, time_result + +wrapper = HTDemucsSimpleWrapper(model) +wrapper.eval() + +# Try tracing +print("Tracing model...") +with torch.no_grad(): + traced = torch.jit.trace(wrapper, (dummy_spectral, dummy_waveform)) + +print("Converting to CoreML (Float32)...") +mlmodel = ct.convert( + traced, + inputs=[ + ct.TensorType( + name="spectral_magnitude", + shape=(1, 4, n_freq, n_frames), + ), + ct.TensorType( + name="audio_waveform", + shape=(1, 2, segment_samples), + ), + ], + outputs=[ + ct.TensorType(name="freq_output"), + ct.TensorType(name="time_output"), + ], + minimum_deployment_target=ct.target.iOS16, + convert_to="mlprogram", + compute_precision=ct.precision.FLOAT32, # Prevent Float16 overflow +) + +mlmodel.author = "Meta Research (Demucs)" +mlmodel.license = "MIT License" +mlmodel.short_description = ( + "HTDemucs audio source separation - separates audio into 4 stems: " + "drums, bass, vocals, other. Requires app-side STFT/iSTFT processing." +) + +mlmodel.save("HTDemucs_SourceSeparation_F32.mlpackage") +print("Saved HTDemucs_SourceSeparation_F32.mlpackage") +print("Model uses Float32 to prevent frequency branch overflow.") From 7283b97f581680c56c078ed2c16c028cb88be56b Mon Sep 17 00:00:00 2001 From: john-rocky Date: Wed, 1 Apr 2026 18:56:28 +0900 Subject: [PATCH 17/18] Switch to time-only reconstruction, add F32 model conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - freq_output overflows Float16 range (±inf) for real STFT data, even with Float32 internal computation (output tensor is Float16) - Use time-domain output only for stem reconstruction - Add F32 model to Xcode project (compute_precision=FLOAT32) - Simplify conversion script: ONNX-based, end-to-end model - Source order confirmed: drums, bass, other, vocals Known issue: Full freq+time reconstruction requires Float32 output tensors in the CoreML model. The current model spec forces Float16 output which cannot represent large STFT values (>65504). Time-only provides decent separation quality. --- conversion_scripts/convert_htdemucs.py | 170 ++++++------------ .../DemucsDemo.xcodeproj/project.pbxproj | 4 + .../DemucsDemo/DemucsDemo/ContentView.swift | 10 +- 3 files changed, 61 insertions(+), 123 deletions(-) diff --git a/conversion_scripts/convert_htdemucs.py b/conversion_scripts/convert_htdemucs.py index 6106feb..350ffda 100644 --- a/conversion_scripts/convert_htdemucs.py +++ b/conversion_scripts/convert_htdemucs.py @@ -1,156 +1,90 @@ # HTDemucs -> CoreML conversion # pip install torch torchaudio coremltools demucs # -# Converts the HTDemucs source separation model to CoreML. -# The model separates audio into 4 stems: drums, bass, vocals, other. +# The model takes raw stereo audio and outputs 4 separated stems directly. +# All STFT/iSTFT/normalization is handled internally by the model. # -# Architecture: Hybrid Transformer Demucs -# - Frequency branch: processes STFT (complex as real/imag channels) -# - Time branch: processes raw waveform -# Both branches produce output that should be combined for best quality. 
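Once the frequency-branch output has been brought back to the time domain via iSTFT, combining the two branches amounts to a sample-wise sum per stem, which is what the later hybrid-reconstruction patch implements. A minimal Swift sketch of that final step using vDSP, assuming freqWave holds the iSTFT of one stem's frequency output and timeWave the matching time-branch output (both names are illustrative):

    import Accelerate

    // Hybrid combination: final stem = freq-branch waveform + time-branch waveform.
    func combineBranches(freqWave: [Float], timeWave: [Float]) -> [Float] {
        precondition(freqWave.count == timeWave.count, "branch lengths must match")
        var combined = [Float](repeating: 0, count: timeWave.count)
        vDSP_vadd(freqWave, 1, timeWave, 1, &combined, 1, vDSP_Length(combined.count))
        return combined
    }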
+# Input: mix [1, 2, 343980] - stereo audio at 44100Hz (~7.8s) +# Output: sources [1, 4, 2, 343980] - 4 stems (drums, bass, other, vocals), stereo # -# NOTE: Use compute_precision=ct.precision.FLOAT32 to avoid Float16 overflow -# in the frequency branch (intermediate values exceed ±65504). +# Uses Float32 to prevent overflow in the frequency branch. import torch import coremltools as ct - -# Load pretrained HTDemucs from demucs.pretrained import get_model -model = get_model("htdemucs") +# Load HTDemucs +bag = get_model("htdemucs") +model = bag.models[0] model.eval() -# Model parameters -segment_samples = int(model.segment * model.samplerate) # ~343980 samples at 44.1kHz -n_fft = model.nfft # 4096 -hop_length = n_fft // 4 # 1024 -n_freq = n_fft // 2 # 2048 -n_frames = segment_samples // hop_length # 336 - -print(f"segment_samples={segment_samples}, n_fft={n_fft}, hop={hop_length}, n_freq={n_freq}, n_frames={n_frames}") - -# Create dummy inputs matching the model's expected format -# spectral_magnitude: complex STFT as real/imag channels [1, 4, n_freq, n_frames] -# torch.cat([z.real, z.imag], dim=1) where z is [batch, 2, n_freq, n_frames] complex -dummy_spectral = torch.randn(1, 4, n_freq, n_frames) -# audio_waveform: raw stereo waveform [1, 2, segment_samples] -dummy_waveform = torch.randn(1, 2, segment_samples) - -# Trace the model's core forward pass -# HTDemucs forward takes both spectral and waveform inputs -class HTDemucsWrapper(torch.nn.Module): - """Wrapper to expose the hybrid encoder-decoder as a single forward pass.""" - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, spectral_magnitude, audio_waveform): - # The model internally splits spectral into real/imag and processes both branches - # Reconstruct the complex STFT from real/imag channels - B = spectral_magnitude.shape[0] - # spectral_magnitude: [B, 4, F, T] -> [B, 2, F, T] complex - half = spectral_magnitude.shape[1] // 2 # 2 - z_real = spectral_magnitude[:, :half] # [B, 2, F, T] - z_imag = spectral_magnitude[:, half:] # [B, 2, F, T] - z = torch.complex(z_real, z_imag) # [B, 2, F, T] complex - - # Run the model's internal processing - # The model has: frequency encoder, time encoder, cross-attention, decoders - length = audio_waveform.shape[-1] - - # Use model's internal _spec and _ispec methods if available, - # otherwise use the full forward pass - x = audio_waveform - - # Encode frequency branch - freq_encoded = self.model._domain_forward("freq", z) - # Encode time branch - time_encoded = self.model._domain_forward("time", x) - - # Cross-attention and decoder - freq_out, time_out = self.model._decode(freq_encoded, time_encoded, length) +segment_samples = int(model.segment * model.samplerate) # 343980 +print(f"sources: {model.sources}") +print(f"segment_samples: {segment_samples}") +print(f"samplerate: {model.samplerate}") - # freq_out: [B, S*2C, F, T] where S=4 sources, C=2 stereo - # Convert complex output to real/imag channels - # freq_out is complex [B, S, 2, F, T] - freq_real = freq_out.real # [B, sources*2, F, T] - freq_imag = freq_out.imag - freq_output = torch.cat([freq_real, freq_imag], dim=2) # or appropriate dim - - return freq_out, time_out - -# Alternative simpler approach: trace the full model -class HTDemucsSimpleWrapper(torch.nn.Module): - """Simple wrapper that takes spectral + waveform and returns separated stems.""" +# Wrapper to flatten output from [1,4,2,T] to [1,8,T] for CoreML compatibility +class HTDemucsExport(torch.nn.Module): def __init__(self, model): 
super().__init__() self.model = model - def forward(self, spectral_magnitude, audio_waveform): - # Reconstruct complex STFT - half = spectral_magnitude.shape[1] // 2 - z_real = spectral_magnitude[:, :half] - z_imag = spectral_magnitude[:, half:] - z = torch.complex(z_real, z_imag) - - length = audio_waveform.shape[-1] - - # Use the model's hybrid forward - # freq_out: complex [B, S, C, F, T], time_out: [B, S, C, T] - freq_out, time_out = self.model.forward_hybrid(z, audio_waveform, length) - - # Convert freq_out complex to real channels - # freq_out shape: [B, 4, 2, n_freq, n_frames] complex - # -> [B, 16, n_freq, n_frames] as real/imag interleaved per source - S = freq_out.shape[1] # 4 sources - C = freq_out.shape[2] # 2 channels - freq_real = freq_out.real.reshape(1, S * C, n_freq, n_frames) - freq_imag = freq_out.imag.reshape(1, S * C, n_freq, n_frames) - freq_result = torch.cat([freq_real, freq_imag], dim=1) # [1, 16, F, T] - - # time_out shape: [B, 4, 2, T] -> [1, 8, T] - time_result = time_out.reshape(1, S * C, -1) + def forward(self, mix): + # mix: [1, 2, T] + # output: [1, 4, 2, T] -> [1, 8, T] + x = self.model(mix) + B, S, C, T = x.shape + return x.reshape(B, S * C, T) - return freq_result, time_result - -wrapper = HTDemucsSimpleWrapper(model) +wrapper = HTDemucsExport(model) wrapper.eval() -# Try tracing -print("Tracing model...") -with torch.no_grad(): - traced = torch.jit.trace(wrapper, (dummy_spectral, dummy_waveform)) +# Export via ONNX to avoid coremltools int op conversion bug +print("Exporting to ONNX...") +dummy = torch.randn(1, 2, segment_samples) +onnx_path = "HTDemucs_F32.onnx" -print("Converting to CoreML (Float32)...") +with torch.no_grad(): + torch.onnx.export( + wrapper, + dummy, + onnx_path, + input_names=["mix"], + output_names=["sources"], + opset_version=17, + do_constant_folding=True, + ) +print(f"Saved ONNX: {onnx_path}") + +# Convert ONNX to CoreML with Float32 +print("Converting ONNX to CoreML (Float32)...") mlmodel = ct.convert( - traced, + onnx_path, inputs=[ ct.TensorType( - name="spectral_magnitude", - shape=(1, 4, n_freq, n_frames), - ), - ct.TensorType( - name="audio_waveform", + name="mix", shape=(1, 2, segment_samples), ), ], outputs=[ - ct.TensorType(name="freq_output"), - ct.TensorType(name="time_output"), + ct.TensorType(name="sources"), ], minimum_deployment_target=ct.target.iOS16, convert_to="mlprogram", - compute_precision=ct.precision.FLOAT32, # Prevent Float16 overflow + compute_precision=ct.precision.FLOAT32, ) mlmodel.author = "Meta Research (Demucs)" mlmodel.license = "MIT License" mlmodel.short_description = ( - "HTDemucs audio source separation - separates audio into 4 stems: " - "drums, bass, vocals, other. Requires app-side STFT/iSTFT processing." + "HTDemucs audio source separation. Input: stereo mix [1,2,343980] at 44.1kHz. " + "Output: [1,8,343980] = 4 stems x 2ch. Order: drums, bass, other, vocals." +) +mlmodel.input_description["mix"] = "Stereo audio waveform [1, 2, 343980] at 44100 Hz (~7.8 seconds)" +mlmodel.output_description["sources"] = ( + "Separated stems [1, 8, 343980]. 8 channels = 4 sources x 2 stereo. 
" + "Source order: drums(0,1), bass(2,3), other(4,5), vocals(6,7)" ) -mlmodel.save("HTDemucs_SourceSeparation_F32.mlpackage") -print("Saved HTDemucs_SourceSeparation_F32.mlpackage") -print("Model uses Float32 to prevent frequency branch overflow.") +mlmodel.save("HTDemucs_F32.mlpackage") +print("Saved HTDemucs_F32.mlpackage") diff --git a/creative_apps/DemucsDemo/DemucsDemo.xcodeproj/project.pbxproj b/creative_apps/DemucsDemo/DemucsDemo.xcodeproj/project.pbxproj index 7f7cacc..af8b36a 100644 --- a/creative_apps/DemucsDemo/DemucsDemo.xcodeproj/project.pbxproj +++ b/creative_apps/DemucsDemo/DemucsDemo.xcodeproj/project.pbxproj @@ -11,6 +11,7 @@ B10003 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B10004 /* ContentView.swift */; }; B10005 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = B10006 /* Assets.xcassets */; }; B1DM02 /* HTDemucs_SourceSeparation.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = B1DM01 /* HTDemucs_SourceSeparation.mlpackage */; }; + B1DM04 /* HTDemucs_SourceSeparation_F32.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = B1DM03 /* HTDemucs_SourceSeparation_F32.mlpackage */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -20,6 +21,7 @@ B10007 /* DemucsDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = DemucsDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; B10008 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; B1DM01 /* HTDemucs_SourceSeparation.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = HTDemucs_SourceSeparation.mlpackage; sourceTree = ""; }; + B1DM03 /* HTDemucs_SourceSeparation_F32.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = HTDemucs_SourceSeparation_F32.mlpackage; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -49,6 +51,7 @@ B10006 /* Assets.xcassets */, B10008 /* Info.plist */, B1DM01 /* HTDemucs_SourceSeparation.mlpackage */, + B1DM03 /* HTDemucs_SourceSeparation_F32.mlpackage */, ); path = DemucsDemo; sourceTree = ""; @@ -133,6 +136,7 @@ B10001 /* DemucsDemoApp.swift in Sources */, B10003 /* ContentView.swift in Sources */, B1DM02 /* HTDemucs_SourceSeparation.mlpackage in Sources */, + B1DM04 /* HTDemucs_SourceSeparation_F32.mlpackage in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift b/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift index f0dad00..877f92a 100644 --- a/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift +++ b/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift @@ -314,7 +314,7 @@ private enum DSP { static let numBins = 2048 // fftSize / 2 static let numFrames = 336 static let segmentLength = 343980 - static let segmentOffset = 0 // Set > 0 to skip intro (e.g., 343980 = skip first ~7.8s) + static let segmentOffset = 343980 // Skip first ~7.8s to reach section with all instruments static let sampleRate: Double = 44100 // Periodic Hann window (matches PyTorch's hann_window with periodic=True) @@ -757,7 +757,7 @@ class DemucsViewModel: ObservableObject { // Load model await updateStatus("Loading model...", progress: 0.1) - guard let modelURL = Bundle.main.url(forResource: "HTDemucs_SourceSeparation", withExtension: "mlmodelc") else { + guard let modelURL = Bundle.main.url(forResource: "HTDemucs_SourceSeparation_F32", 
withExtension: "mlmodelc") else { throw DemucsError.modelNotFound( "HTDemucs_SourceSeparation.mlmodelc not found in bundle. " + "Please compile and add the HTDemucs_SourceSeparation.mlpackage to the project." @@ -770,7 +770,7 @@ class DemucsViewModel: ObservableObject { // Prepare model inputs await updateStatus("Preparing input...", progress: 0.2) - // spectral_magnitude [1, 4, 2048, 336] - zeros (freq branch overflows Float16) + // spectral_magnitude [1, 4, 2048, 336] - zeros (freq output is Float16, overflows for real STFT data) let spectral = try MLMultiArray(shape: [1, 4, 2048, 336], dataType: .float32) // audio_waveform [1, 2, 343980] @@ -787,7 +787,7 @@ class DemucsViewModel: ObservableObject { ]) let output = try model.prediction(from: inputFeatures) - // Extract time-domain output + // Extract time-domain output (freq_output overflows Float16 for real STFT data) guard let timeOut = output.featureValue(for: "time_output")?.multiArrayValue else { throw DemucsError.processingFailed("Missing model output") } @@ -801,7 +801,7 @@ class DemucsViewModel: ObservableObject { let i = stem.modelIndex await updateStatus("Reconstructing \(stem.rawValue)...", progress: 0.65 + Double(i) * 0.08) - // Time domain output only (freq_output overflows Float16 → ±inf) + // Time domain output only (freq_output overflows Float16 range for real STFT data) let stemLeft = DSP.extractChannel1D(from: timeOut, batch: 0, channel: 2 * i, width: DSP.segmentLength) let stemRight = DSP.extractChannel1D(from: timeOut, batch: 0, channel: 2 * i + 1, width: DSP.segmentLength) From b1fa776ee1f4d3de2ff2e7ff4c9cabddbba776cf Mon Sep 17 00:00:00 2001 From: john-rocky Date: Thu, 2 Apr 2026 10:52:33 +0900 Subject: [PATCH 18/18] Implement full freq+time hybrid reconstruction for HTDemucs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add computeSpectralInput: STFT with Python _spec padding, CaC channel format - Add inverseSpec: iSTFT matching Python _ispec (time padding + trim) - Feed actual STFT data to spectral_magnitude (was zeros, disabling freq branch) - Normalize STFT input (÷√N) to match Python torch.stft(normalized=True) - Compensate iSTFT output (×√N) for correct freq+time addition - Add stride-aware MLMultiArray fallback for non-contiguous layouts - Fix stem order: vocals=3, other=2 (matching Python model.sources) - Generalize forwardSTFT/inverseSTFT for variable frame counts --- .../DemucsDemo/DemucsDemo/ContentView.swift | 207 ++++++++++++++---- 1 file changed, 161 insertions(+), 46 deletions(-) diff --git a/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift b/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift index 877f92a..12097a7 100644 --- a/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift +++ b/creative_apps/DemucsDemo/DemucsDemo/ContentView.swift @@ -15,13 +15,13 @@ enum Stem: String, CaseIterable, Identifiable { var id: String { rawValue } - // Index in model output (freq_output: 4 channels per stem, time_output: 2 channels per stem) + // Index in model output — matches Python's model.sources: [drums, bass, other, vocals] var modelIndex: Int { switch self { case .drums: return 0 case .bass: return 1 - case .vocals: return 2 - case .other: return 3 + case .other: return 2 + case .vocals: return 3 } } @@ -322,23 +322,20 @@ private enum DSP { Float(0.5 * (1.0 - cos(2.0 * .pi * Double($0) / Double(fftSize)))) } - /// Reflect-pad a signal (matches PyTorch's F.pad with mode='reflect') - static func reflectPad(signal: [Float], padSize: Int) -> [Float] { + /// 
Reflect-pad a signal with asymmetric left/right amounts (matches PyTorch's F.pad with mode='reflect') + static func reflectPad(signal: [Float], left: Int, right: Int) -> [Float] { let n = signal.count - var padded = [Float](repeating: 0, count: n + 2 * padSize) - // Left: signal[padSize], signal[padSize-1], ..., signal[1] - for i in 0...size) + memcpy(dst.baseAddress! + left, src.baseAddress!, n * MemoryLayout.size) } } - // Right: signal[n-2], signal[n-3], ..., signal[n-1-padSize] - for i in 0.. (real: [Float], imag: [Float]) { + /// Forward STFT using vDSP. Frame count derived from signal length. + /// Returns (real, imag) arrays in bin-major order [numBins * frames], and the frame count. + static func forwardSTFT(signal: [Float]) -> (real: [Float], imag: [Float], frames: Int) { let log2n = vDSP_Length(log2(Float(fftSize))) - guard let fftSetup = vDSP_create_fftsetup(log2n, FFTRadix(kFFTRadix2)) else { return ([], []) } + guard let fftSetup = vDSP_create_fftsetup(log2n, FFTRadix(kFFTRadix2)) else { return ([], [], 0) } defer { vDSP_destroy_fftsetup(fftSetup) } let halfN = fftSize / 2 - var allReal = [Float](repeating: 0, count: numBins * numFrames) - var allImag = [Float](repeating: 0, count: numBins * numFrames) + let totalFrames = max(1, (signal.count - fftSize) / hopSize + 1) + var allReal = [Float](repeating: 0, count: numBins * totalFrames) + var allImag = [Float](repeating: 0, count: numBins * totalFrames) var frame = [Float](repeating: 0, count: fftSize) var rp = [Float](repeating: 0, count: halfN) var ip = [Float](repeating: 0, count: halfN) - for f in 0.. [Float] { + let le = numFrames // 336 + // _spec padding: pad = hop_length//2 * 3, right = pad + le*hop - length + let specPadLeft = hopSize / 2 * 3 // 1536 + let specPadRight = specPadLeft + le * hopSize - segmentLength // 1620 + + let channelSize = numBins * le + var result = [Float](repeating: 0, count: 4 * channelSize) + + for (ch, signal) in [left, right].enumerated() { + // Reflect-pad matching _spec (no center padding needed — selected frames are identical) + let padded = reflectPad(signal: signal, left: specPadLeft, right: specPadRight) + let (real, imag, frames) = forwardSTFT(signal: padded) + assert(frames == le, "Expected \(le) frames, got \(frames)") + + // CaC channel layout: [L_real, L_imag, R_real, R_imag] + let realCh = ch * 2 // L_real=0, R_real=2 + let imagCh = ch * 2 + 1 // L_imag=1, R_imag=3 + result.withUnsafeMutableBufferPointer { dst in + real.withUnsafeBufferPointer { src in + memcpy(dst.baseAddress! + realCh * channelSize, src.baseAddress!, channelSize * MemoryLayout.size) + } + imag.withUnsafeBufferPointer { src in + memcpy(dst.baseAddress! + imagCh * channelSize, src.baseAddress!, channelSize * MemoryLayout.size) + } + } + } + return result + } + + /// Inverse STFT with overlap-add. Frame count derived from input array size. + /// Input: (real, imag) arrays in bin-major order [numBins * frames]. static func inverseSTFT(real: [Float], imag: [Float], outputLength: Int) -> [Float] { let log2n = vDSP_Length(log2(Float(fftSize))) guard let fftSetup = vDSP_create_fftsetup(log2n, FFTRadix(kFFTRadix2)) else { return [] } defer { vDSP_destroy_fftsetup(fftSetup) } let halfN = fftSize / 2 + let totalFrames = real.count / numBins var output = [Float](repeating: 0, count: outputLength) var windowSum = [Float](repeating: 0, count: outputLength) var rp = [Float](repeating: 0, count: halfN) var ip = [Float](repeating: 0, count: halfN) var frame = [Float](repeating: 0, count: fftSize) - for f in 0.. 
[Float] { + let le = numFrames // 336 + let totalFrames = le + 4 // 340: 2 zero frames on each side (matches _ispec F.pad(z, (2, 2))) + let specPad = hopSize / 2 * 3 // 1536 + let centerPad = fftSize / 2 // 2048 + + // Pad time axis: insert 2 zero frames at start and end + var paddedReal = [Float](repeating: 0, count: numBins * totalFrames) + var paddedImag = [Float](repeating: 0, count: numBins * totalFrames) + for bin in 0...size) + } + } + imag.withUnsafeBufferPointer { src in + paddedImag.withUnsafeMutableBufferPointer { dst in + memcpy(dst.baseAddress! + dstOffset, src.baseAddress! + srcOffset, le * MemoryLayout.size) + } + } + } + + // iSTFT: 340 frames → (339 * 1024 + 4096) = 351232 samples + let rawLen = (totalFrames - 1) * hopSize + fftSize + let rawOutput = inverseSTFT(real: paddedReal, imag: paddedImag, outputLength: rawLen) + + // Trim: skip centerPad + specPad, take segmentLength samples + let trimStart = centerPad + specPad // 3584 + return Array(rawOutput[trimStart.. [Float] { let count = array.count @@ -755,23 +819,52 @@ class DemucsViewModel: ObservableObject { await updateStatus("Loading audio...", progress: 0.05) let (rawLeft, rawRight) = try DSP.loadAudio(url: url) - // Load model + // 2. Load model await updateStatus("Loading model...", progress: 0.1) guard let modelURL = Bundle.main.url(forResource: "HTDemucs_SourceSeparation_F32", withExtension: "mlmodelc") else { throw DemucsError.modelNotFound( - "HTDemucs_SourceSeparation.mlmodelc not found in bundle. " + - "Please compile and add the HTDemucs_SourceSeparation.mlpackage to the project." + "HTDemucs_SourceSeparation_F32.mlmodelc not found in bundle. " + + "Please compile and add the HTDemucs_SourceSeparation_F32.mlpackage to the project." ) } let config = MLModelConfiguration() config.computeUnits = .cpuOnly let model = try MLModel(contentsOf: modelURL, configuration: config) - // Prepare model inputs + // 3. Compute spectral magnitude from audio (matching Python _spec + _magnitude) + await updateStatus("Computing STFT...", progress: 0.15) + var spectralData = DSP.computeSpectralInput(left: rawLeft, right: rawRight) + + // Normalize to match Python's torch.stft(normalized=True): divide by sqrt(nfft) + var normScale = Float(1.0 / sqrt(Double(DSP.fftSize))) // 1/64 + vDSP_vsmul(spectralData, 1, &normScale, &spectralData, 1, vDSP_Length(spectralData.count)) + + // 4. Prepare model inputs await updateStatus("Preparing input...", progress: 0.2) - // spectral_magnitude [1, 4, 2048, 336] - zeros (freq output is Float16, overflows for real STFT data) + // spectral_magnitude [1, 4, 2048, 336] — normalized STFT data in CaC format let spectral = try MLMultiArray(shape: [1, 4, 2048, 336], dataType: .float32) + let spectralCount = 4 * DSP.numBins * DSP.numFrames + let spectralPtr = spectral.dataPointer.bindMemory(to: Float.self, capacity: spectralCount) + + // Verify strides are C-contiguous before memcpy + let spectralStrides = spectral.strides.map { $0.intValue } + if spectralStrides.last == 1 && spectralStrides[2] == DSP.numFrames { + spectralData.withUnsafeBufferPointer { src in + memcpy(spectralPtr, src.baseAddress!, spectralCount * MemoryLayout.size) + } + } else { + // Stride-aware fallback + for c in 0..<4 { + for h in 0..