Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
0b84da0
Add 20 new CoreML models with sample apps
john-rocky Mar 28, 2026
4ee4dba
Add missing AppIcon.appiconset to all creative apps
john-rocky Mar 28, 2026
9b0df17
Add .DS_Store and xcuserdata to gitignore, remove tracked copies
john-rocky Mar 28, 2026
d7bf6f0
Remove tracked .DS_Store and Xcode user files
john-rocky Mar 28, 2026
f958011
Add 10 new CoreML models with sample apps and conversion scripts
john-rocky Mar 28, 2026
040d473
Add mlpackage references to Xcode projects for local testing
john-rocky Mar 30, 2026
0b203bf
Add device compatibility warning for Depth Pro demo
john-rocky Mar 30, 2026
431180a
Fix Depth Pro ANE compilation failure: use cpuAndGPU
john-rocky Mar 30, 2026
d614411
Fix Depth Pro model input/output format
john-rocky Mar 30, 2026
699a9e5
Fix DepthPro fallback: use depthStats instead of missing properties
john-rocky Mar 30, 2026
27c98a3
Remove Depth Pro (too heavy for iPhone), fix BiRefNet to 512x512
john-rocky Mar 30, 2026
f93089e
Fix BiRefNet Float16 input/output handling
john-rocky Mar 30, 2026
c98bef6
Fix BiRefNet cutout rotation: normalize image orientation before masking
john-rocky Mar 30, 2026
4fcde20
Remove apps with existing official implementations, implement LivePor…
john-rocky Mar 31, 2026
cf93a3f
Implement HTDemucs source separation pipeline with STFT/iSTFT
john-rocky Apr 1, 2026
a748acd
Add HTDemucs CoreML conversion script (Float32)
john-rocky Apr 1, 2026
7283b97
Switch to time-only reconstruction, add F32 model conversion
john-rocky Apr 1, 2026
b1fa776
Implement full freq+time hybrid reconstruction for HTDemucs
john-rocky Apr 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# macOS / Xcode
.DS_Store
xcuserdata/
*.xcworkspace/

# CoreML model files (download from Google Drive)
*.mlpackage
*.mlmodel
*.mlmodelc
*.mlpackage/

# Converted models directory
converted_models/
creative_models/

# Python conversion scripts
convert_all.py
convert_remaining.py
__pycache__/
*.pyc
323 changes: 322 additions & 1 deletion README.md

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions conversion_scripts/convert_birefnet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# BiRefNet -> CoreML conversion
# pip install torch torchvision coremltools transformers
#
# Traces the BiRefNet segmentation model at a fixed 1024x1024 input and
# converts it to an mlprogram .mlpackage targeting iOS 16+.
import torch
import coremltools as ct
from transformers import AutoModelForImageSegmentation

model = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet", trust_remote_code=True)
model.eval()

# Trace under no_grad so autograd bookkeeping is neither recorded nor
# baked into the exported graph (also cuts peak memory during tracing).
dummy = torch.randn(1, 3, 1024, 1024)
with torch.no_grad():
    traced = torch.jit.trace(model, dummy)

# NOTE(review): scale=1/255 only maps pixels to [0, 1]. BiRefNet's reference
# pipeline normalizes with ImageNet mean/std; if the checkpoint does not fold
# that in, add per-channel bias/scale here -- confirm against the model card.
mlmodel = ct.convert(
    traced,
    inputs=[ct.ImageType(name="image", shape=(1, 3, 1024, 1024), scale=1/255.0)],
    outputs=[ct.TensorType(name="mask")],
    minimum_deployment_target=ct.target.iOS16,
    convert_to="mlprogram",
)
mlmodel.save("BiRefNet.mlpackage")
90 changes: 90 additions & 0 deletions conversion_scripts/convert_htdemucs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# HTDemucs -> CoreML conversion
# pip install torch torchaudio coremltools demucs
#
# The model takes raw stereo audio and outputs 4 separated stems directly.
# All STFT/iSTFT/normalization is handled internally by the model.
#
# Input: mix [1, 2, 343980] - stereo audio at 44100Hz (~7.8s)
# Output: sources [1, 4, 2, 343980] - 4 stems (drums, bass, other, vocals), stereo
#
# Uses Float32 to prevent overflow in the frequency branch.

import torch
import coremltools as ct
from demucs.pretrained import get_model

# Load HTDemucs
# get_model returns a bag of models (exposes a .models list); export only
# the first member so the CoreML package holds a single network.
bag = get_model("htdemucs")
model = bag.models[0]
model.eval()  # inference mode: freezes dropout/batch-norm behavior

# Window length the model was trained on, in samples (segment appears to be
# seconds and samplerate Hz -- product gives the 343980-sample window).
segment_samples = int(model.segment * model.samplerate)  # 343980
print(f"sources: {model.sources}")
print(f"segment_samples: {segment_samples}")
print(f"samplerate: {model.samplerate}")

# Wrapper to flatten output from [1,4,2,T] to [1,8,T] for CoreML compatibility
class HTDemucsExport(torch.nn.Module):
    """Export wrapper that collapses the (source, channel) axes into one.

    CoreML tooling handles a rank-3 output more reliably than the model's
    native [batch, sources, channels, time] tensor, so the forward pass
    merges sources and stereo channels into a single 8-wide axis.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, mix):
        # mix: [1, 2, T]; the wrapped model emits [1, 4, 2, T].
        stems = self.model(mix)
        # Merge source and channel axes: [1, 4, 2, T] -> [1, 8, T].
        return stems.flatten(start_dim=1, end_dim=2)

# Build the export wrapper so the CoreML output is rank-3 ([1, 8, T]).
wrapper = HTDemucsExport(model)
wrapper.eval()

# Export via ONNX to avoid coremltools int op conversion bug
print("Exporting to ONNX...")
dummy = torch.randn(1, 2, segment_samples)
onnx_path = "HTDemucs_F32.onnx"

with torch.no_grad():
    torch.onnx.export(
        wrapper,
        dummy,
        onnx_path,
        input_names=["mix"],
        output_names=["sources"],
        opset_version=17,
        do_constant_folding=True,
    )
print(f"Saved ONNX: {onnx_path}")

# Convert ONNX to CoreML with Float32
# NOTE(review): recent coremltools releases (>= 6) removed the ONNX frontend,
# so ct.convert() on an .onnx path may raise -- confirm the pinned
# coremltools version still accepts ONNX input.
print("Converting ONNX to CoreML (Float32)...")
mlmodel = ct.convert(
    onnx_path,
    inputs=[
        ct.TensorType(
            name="mix",
            shape=(1, 2, segment_samples),
        ),
    ],
    outputs=[
        ct.TensorType(name="sources"),
    ],
    minimum_deployment_target=ct.target.iOS16,
    convert_to="mlprogram",
    # Full precision on purpose: the file header notes Float16 overflows
    # in the frequency branch.
    compute_precision=ct.precision.FLOAT32,
)

# Embed provenance and I/O documentation in the .mlpackage metadata.
mlmodel.author = "Meta Research (Demucs)"
mlmodel.license = "MIT License"
mlmodel.short_description = (
    "HTDemucs audio source separation. Input: stereo mix [1,2,343980] at 44.1kHz. "
    "Output: [1,8,343980] = 4 stems x 2ch. Order: drums, bass, other, vocals."
)
mlmodel.input_description["mix"] = "Stereo audio waveform [1, 2, 343980] at 44100 Hz (~7.8 seconds)"
mlmodel.output_description["sources"] = (
    "Separated stems [1, 8, 343980]. 8 channels = 4 sources x 2 stereo. "
    "Source order: drums(0,1), bass(2,3), other(4,5), vocals(6,7)"
)

mlmodel.save("HTDemucs_F32.mlpackage")
print("Saved HTDemucs_F32.mlpackage")
29 changes: 29 additions & 0 deletions conversion_scripts/convert_kokoro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Kokoro-82M -> CoreML conversion
# Pre-converted CoreML model available at: https://huggingface.co/FluidInference/kokoro-82m-coreml
# iOS Swift package: https://github.com/mlalma/kokoro-ios
#
# Manual conversion:
# pip install torch coremltools kokoro

import torch
import coremltools as ct

# Kokoro has a two-stage pipeline: Duration Predictor + Decoder
# The model uses StyleTTS2-based architecture with ISTFTNet decoder

from huggingface_hub import hf_hub_download
import json

# Pull the published ONNX export of the model from the HuggingFace Hub;
# hf_hub_download returns a local cache path to the file.
repo_id = "hexgrad/Kokoro-82M"
onnx_weights = hf_hub_download(repo_id, "kokoro-v1.0.onnx")

# Feed the ONNX graph through coremltools, targeting an iOS 16 mlprogram.
coreml_model = ct.converters.convert(
    onnx_weights,
    minimum_deployment_target=ct.target.iOS16,
    convert_to="mlprogram",
)

coreml_model.save("Kokoro82M.mlpackage")
print("Saved Kokoro82M.mlpackage")
34 changes: 34 additions & 0 deletions conversion_scripts/convert_ppocr_v5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# PP-OCRv5 -> CoreML conversion
# PP-OCRv5 by Baidu PaddlePaddle - Ultra lightweight multilingual OCR
# https://github.com/PaddlePaddle/PaddleOCR
# pip install paddlepaddle paddleocr torch coremltools onnx

# Step 1: Export PaddleOCR to ONNX using paddle2onnx
# pip install paddle2onnx
# paddle2onnx --model_dir ./PP-OCRv5_det --model_filename inference.pdmodel \
#   --params_filename inference.pdiparams --save_file ppocrv5_det.onnx

# Step 2: Convert ONNX to CoreML
import coremltools as ct
import onnx


def _onnx_to_mlpackage(onnx_file, input_types, save_path):
    """Load an ONNX graph and convert it to an iOS 16 mlprogram package."""
    graph = onnx.load(onnx_file)
    converted = ct.converters.convert(
        graph,
        inputs=input_types,
        minimum_deployment_target=ct.target.iOS16,
        convert_to="mlprogram",
    )
    converted.save(save_path)


# Detection model: takes a full 640x640 image and localizes text regions.
_onnx_to_mlpackage(
    "ppocrv5_det.onnx",
    [ct.ImageType(name="image", shape=(1, 3, 640, 640), scale=1/255.0)],
    "PPOCRv5_Det.mlpackage",
)

# Recognition model: reads each cropped 48x320 text line as a raw tensor.
_onnx_to_mlpackage(
    "ppocrv5_rec.onnx",
    [ct.TensorType(name="image", shape=(1, 3, 48, 320))],
    "PPOCRv5_Rec.mlpackage",
)
print("Saved PPOCRv5_Det.mlpackage and PPOCRv5_Rec.mlpackage")
35 changes: 35 additions & 0 deletions conversion_scripts/convert_smolvlm2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# SmolVLM2-500M -> CoreML conversion
# pip install torch coremltools transformers accelerate
#
# Only the vision encoder is exported here; see the notes below for the
# language-model half of the pipeline.

import torch
import coremltools as ct
from transformers import AutoProcessor, AutoModelForVision2Seq

model_name = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.float32)
model.eval()

# Note: VLM conversion to CoreML is complex due to autoregressive generation.
# For production use, consider:
# 1. Export vision encoder separately
# 2. Export language model separately
# 3. Use MLX Swift for on-device inference (proven to work on iPhone)
#
# Vision Encoder conversion:
vision_encoder = model.model.vision_model
dummy_pixel = torch.randn(1, 3, 384, 384)
# Trace inside no_grad so autograd bookkeeping is not recorded into the
# exported graph (and tracing uses less memory).
with torch.no_grad():
    traced_vision = torch.jit.trace(vision_encoder, dummy_pixel)

# NOTE(review): scale=1/255 only rescales pixels to [0, 1]. The processor's
# image pipeline may additionally apply mean/std normalization -- compare
# against processor.image_processor and fold any bias/scale in here if needed.
vision_ml = ct.convert(
    traced_vision,
    inputs=[ct.ImageType(name="pixel_values", shape=(1, 3, 384, 384), scale=1/255.0)],
    outputs=[ct.TensorType(name="image_features")],
    minimum_deployment_target=ct.target.iOS16,
    convert_to="mlprogram",
)
vision_ml.save("SmolVLM2_VisionEncoder.mlpackage")
print("Saved SmolVLM2_VisionEncoder.mlpackage")

# For the full model, consider using GGUF format with llama.cpp or MLX Swift
# GGUF models available at: https://huggingface.co/ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
15 changes: 15 additions & 0 deletions conversion_scripts/convert_yoloe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# YOLOE-S -> CoreML conversion
# YOLOE: Real-Time Seeing Anything (ICCV 2025)
# https://github.com/THU-MIG/yoloe
# pip install ultralytics

from ultralytics import YOLO

# Load the prompt-capable YOLOE-S segmentation checkpoint, then export it
# straight to CoreML (640px input, float16 weights).
yoloe = YOLO("yoloe-11s-seg.pt")
yoloe.export(format="coreml", imgsz=640, half=True)
print("Exported YOLOE-S to CoreML format")

# Alternative: Export with ONNX first then convert
# model.export(format="onnx", imgsz=640)
# Then use coremltools to convert ONNX -> CoreML
26 changes: 26 additions & 0 deletions conversion_scripts/convert_yolov10.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""
Convert YOLOv10-N (Nano) to CoreML format.

Requirements:
pip install ultralytics

The exported model will be saved alongside the .pt weights as
yolov10n.mlpackage. Drag it into the Xcode project so the compiler
produces the bundled .mlmodelc at build time.

Usage:
python convert_yolov10.py
"""

from ultralytics import YOLO

# Download (if needed) and load the pretrained YOLOv10-N weights
model = YOLO("yolov10n.pt")

# Export to CoreML
# - imgsz : input resolution expected by the model
# - half : use float16 for smaller model size on device
# - nms : disable built-in NMS (YOLOv10 is NMS-free by design)
model.export(format="coreml", imgsz=640, half=True, nms=False)

print("CoreML conversion complete. Look for yolov10n.mlpackage")
Loading