Skip to content

Commit f730030

Browse files
authored
Merge pull request #142 from SharpAI/feature/apple-npu-detection
feat(detection): route Apple Silicon YOLO to Neural Engine (NPU)
2 parents 679b017 + 18802d6 commit f730030

File tree

5 files changed

+447
-17
lines changed

5 files changed

+447
-17
lines changed

skills/detection/yolo-detection-2026/SKILL.md

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,15 @@ parameters:
6666
description: "Auto-convert model to optimized format for faster inference"
6767
group: Performance
6868

69+
- name: compute_units
70+
label: "Apple Compute Units"
71+
type: select
72+
options: ["auto", "cpu_and_ne", "all", "cpu_only", "cpu_and_gpu"]
73+
default: "auto"
74+
description: "CoreML compute target — 'auto' routes to Neural Engine (NPU), leaving GPU free for LLM/VLM"
75+
group: Performance
76+
platform: macos
77+
6978
capabilities:
7079
live_detection:
7180
script: scripts/detect.py
@@ -89,13 +98,15 @@ Real-time object detection using the latest YOLO 2026 models. Detects 80+ COCO o
8998

9099
The skill uses [`env_config.py`](../../lib/env_config.py) to **automatically detect hardware** and convert the model to the fastest format for your platform. Conversion happens once during deployment and is cached.
91100

92-
| Platform | Backend | Optimized Format | Expected Speedup |
93-
|----------|---------|------------------|:----------------:|
94-
| NVIDIA GPU | CUDA | TensorRT `.engine` | ~3-5x |
95-
| Apple Silicon (M1+) | MPS | CoreML `.mlpackage` | ~2x |
96-
| Intel CPU/GPU/NPU | OpenVINO | OpenVINO IR `.xml` | ~2-3x |
97-
| AMD GPU | ROCm | ONNX Runtime | ~1.5-2x |
98-
| CPU (any) | CPU | ONNX Runtime | ~1.5x |
101+
| Platform | Backend | Optimized Format | Compute Units | Expected Speedup |
102+
|----------|---------|------------------|:-------------:|:----------------:|
103+
| NVIDIA GPU | CUDA | TensorRT `.engine` | GPU | ~3-5x |
104+
| Apple Silicon (M1+) | MPS | CoreML `.mlpackage` | **Neural Engine** (NPU) | ~2x |
105+
| Intel CPU/GPU/NPU | OpenVINO | OpenVINO IR `.xml` | CPU/GPU/NPU | ~2-3x |
106+
| AMD GPU | ROCm | ONNX Runtime | GPU | ~1.5-2x |
107+
| CPU (any) | CPU | ONNX Runtime | CPU | ~1.5x |
108+
109+
> **Apple Silicon Note**: Detection defaults to `cpu_and_ne` (CPU + Neural Engine), keeping the GPU free for LLM/VLM inference. Set `compute_units: all` to include GPU if not running local LLM.
99110
100111
### How It Works
101112

skills/detection/yolo-detection-2026/scripts/detect.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ def main():
248248
perf.model_load_ms = env.load_ms
249249
perf.export_ms = env.export_ms
250250

251-
emit({
251+
ready_event = {
252252
"event": "ready",
253253
"model": f"yolo2026{model_size[0]}",
254254
"model_size": model_size,
@@ -260,7 +260,10 @@ def main():
260260
"fps": fps,
261261
"model_load_ms": round(env.load_ms, 1),
262262
"available_sizes": list(MODEL_SIZE_MAP.keys()),
263-
})
263+
}
264+
if hasattr(env, 'compute_units') and env.backend == "mps":
265+
ready_event["compute_units"] = env.compute_units
266+
emit(ready_event)
264267
except Exception as e:
265268
emit({"event": "error", "message": f"Failed to load model: {e}", "retriable": False})
266269
sys.exit(1)

skills/detection/yolo-detection-2026/scripts/env_config.py

Lines changed: 74 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class BackendSpec:
4040
model_suffix: str # file extension/dir to look for cached model
4141
half: bool = True # use FP16
4242
extra_export_args: dict = field(default_factory=dict)
43+
compute_units: Optional[str] = None # CoreML compute units: "cpu_and_ne", "all", etc.
4344

4445

4546
BACKEND_SPECS = {
@@ -61,6 +62,7 @@ class BackendSpec:
6162
model_suffix=".mlpackage",
6263
half=True,
6364
extra_export_args={"nms": False},
65+
compute_units="cpu_and_ne", # Route to Neural Engine, leave GPU free for LLM/VLM
6466
),
6567
"intel": BackendSpec(
6668
name="intel",
@@ -86,6 +88,7 @@ class HardwareEnv:
8688
backend: str = "cpu" # "cuda" | "rocm" | "mps" | "intel" | "cpu"
8789
device: str = "cpu" # torch device string
8890
export_format: str = "onnx" # optimal export format
91+
compute_units: str = "all" # CoreML compute units (Apple only)
8992
gpu_name: str = "" # human-readable GPU name
9093
gpu_memory_mb: int = 0 # GPU memory in MB
9194
driver_version: str = "" # GPU driver version
@@ -113,9 +116,11 @@ def detect() -> "HardwareEnv":
113116
else:
114117
env._fallback_cpu()
115118

116-
# Set export format from backend spec
119+
# Set export format and compute units from backend spec
117120
spec = BACKEND_SPECS.get(env.backend, BACKEND_SPECS["cpu"])
118121
env.export_format = spec.export_format
122+
if spec.compute_units:
123+
env.compute_units = spec.compute_units
119124

120125
# Check if optimized runtime is available
121126
env.framework_ok = env._check_framework()
@@ -439,6 +444,58 @@ def export_model(self, model, model_name: str) -> Optional[Path]:
439444

440445
return None
441446

447+
def _load_coreml_with_compute_units(self, model_path: str):
    """Load a CoreML model via YOLO, forcing specific compute units.

    Ultralytics does not expose coremltools' ``compute_units`` parameter,
    so we temporarily monkey-patch ``ct.models.MLModel`` to inject it
    (e.g. ``CPU_AND_NE`` to prefer the Apple Neural Engine). The patch is
    scoped to the single ``YOLO()`` call and restored in a ``finally``
    block, so concurrent/later loads are unaffected.

    Args:
        model_path: Path to the exported ``.mlpackage`` model.

    Returns:
        A loaded ``ultralytics.YOLO`` model. Falls back to a plain
        ``YOLO(model_path)`` load when coremltools is unavailable, the
        configured ``self.compute_units`` string is unknown, or the
        installed coremltools lacks the requested enum member.
    """
    from ultralytics import YOLO

    # Map our string config values to coremltools enum member names.
    _COMPUTE_UNIT_MAP = {
        "all": "ALL",
        "cpu_only": "CPU_ONLY",
        "cpu_and_gpu": "CPU_AND_GPU",
        "cpu_and_ne": "CPU_AND_NE",
    }

    ct_enum_name = _COMPUTE_UNIT_MAP.get(self.compute_units)
    if not ct_enum_name:
        _log(f"Unknown compute_units '{self.compute_units}', using default")
        return YOLO(model_path)

    try:
        import coremltools as ct

        # Older coremltools releases may lack some enum members.
        target_units = getattr(ct.ComputeUnit, ct_enum_name, None)
        if target_units is None:
            _log(f"coremltools.ComputeUnit.{ct_enum_name} not available")
            return YOLO(model_path)

        # Temporarily patch MLModel so ultralytics' internal
        # ct.models.MLModel(...) call picks up our compute_units.
        _OrigMLModel = ct.models.MLModel

        class _PatchedMLModel(_OrigMLModel):
            def __init__(self, *args, **kwargs):
                # setdefault: respect an explicit compute_units if the
                # caller ever passes one itself.
                kwargs.setdefault('compute_units', target_units)
                super().__init__(*args, **kwargs)

        ct.models.MLModel = _PatchedMLModel
        try:
            model = YOLO(model_path)
        finally:
            ct.models.MLModel = _OrigMLModel  # Always restore

        # BUGFIX: report the units actually requested — the old message
        # claimed "(Neural Engine preferred)" even for cpu_only/all.
        _log(f"CoreML model loaded with compute_units={ct_enum_name}")
        return model

    except ImportError:
        _log("coremltools not available, loading without compute_units")
        return YOLO(model_path)
498+
442499
def load_optimized(self, model_name: str, use_optimized: bool = True):
443500
"""
444501
Load the best available model for this hardware.
@@ -455,7 +512,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
455512
optimized_path = self.get_optimized_path(model_name)
456513
if optimized_path.exists():
457514
try:
458-
model = YOLO(str(optimized_path))
515+
# On Apple Silicon: route CoreML to Neural Engine
516+
if self.backend == "mps" and self.compute_units != "all":
517+
model = self._load_coreml_with_compute_units(
518+
str(optimized_path))
519+
else:
520+
model = YOLO(str(optimized_path))
459521
self.load_ms = (time.perf_counter() - t0) * 1000
460522
_log(f"Loaded {self.export_format} model ({self.load_ms:.0f}ms)")
461523
return model, self.export_format
@@ -467,7 +529,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
467529
exported = self.export_model(pt_model, model_name)
468530
if exported:
469531
try:
470-
model = YOLO(str(exported))
532+
# On Apple Silicon: route CoreML to Neural Engine
533+
if self.backend == "mps" and self.compute_units != "all":
534+
model = self._load_coreml_with_compute_units(
535+
str(exported))
536+
else:
537+
model = YOLO(str(exported))
471538
self.load_ms = (time.perf_counter() - t0) * 1000
472539
_log(f"Loaded freshly exported {self.export_format} model ({self.load_ms:.0f}ms)")
473540
return model, self.export_format
@@ -508,7 +575,7 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
508575

509576
def to_dict(self) -> dict:
510577
"""Serialize environment info for JSON output."""
511-
return {
578+
d = {
512579
"backend": self.backend,
513580
"device": self.device,
514581
"export_format": self.export_format,
@@ -519,6 +586,9 @@ def to_dict(self) -> dict:
519586
"export_ms": round(self.export_ms, 1),
520587
"load_ms": round(self.load_ms, 1),
521588
}
589+
if self.backend == "mps":
590+
d["compute_units"] = self.compute_units
591+
return d
522592

523593

524594
# ─── CLI: run standalone for diagnostics ─────────────────────────────────────

skills/lib/env_config.py

Lines changed: 74 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class BackendSpec:
4040
model_suffix: str # file extension/dir to look for cached model
4141
half: bool = True # use FP16
4242
extra_export_args: dict = field(default_factory=dict)
43+
compute_units: Optional[str] = None # CoreML compute units: "cpu_and_ne", "all", etc.
4344

4445

4546
BACKEND_SPECS = {
@@ -61,6 +62,7 @@ class BackendSpec:
6162
model_suffix=".mlpackage",
6263
half=True,
6364
extra_export_args={"nms": False},
65+
compute_units="cpu_and_ne", # Route to Neural Engine, leave GPU free for LLM/VLM
6466
),
6567
"intel": BackendSpec(
6668
name="intel",
@@ -86,6 +88,7 @@ class HardwareEnv:
8688
backend: str = "cpu" # "cuda" | "rocm" | "mps" | "intel" | "cpu"
8789
device: str = "cpu" # torch device string
8890
export_format: str = "onnx" # optimal export format
91+
compute_units: str = "all" # CoreML compute units (Apple only)
8992
gpu_name: str = "" # human-readable GPU name
9093
gpu_memory_mb: int = 0 # GPU memory in MB
9194
driver_version: str = "" # GPU driver version
@@ -113,9 +116,11 @@ def detect() -> "HardwareEnv":
113116
else:
114117
env._fallback_cpu()
115118

116-
# Set export format from backend spec
119+
# Set export format and compute units from backend spec
117120
spec = BACKEND_SPECS.get(env.backend, BACKEND_SPECS["cpu"])
118121
env.export_format = spec.export_format
122+
if spec.compute_units:
123+
env.compute_units = spec.compute_units
119124

120125
# Check if optimized runtime is available
121126
env.framework_ok = env._check_framework()
@@ -439,6 +444,58 @@ def export_model(self, model, model_name: str) -> Optional[Path]:
439444

440445
return None
441446

447+
def _load_coreml_with_compute_units(self, model_path: str):
    """Load a CoreML model via YOLO, forcing specific compute units.

    Ultralytics does not expose coremltools' ``compute_units`` parameter,
    so we temporarily monkey-patch ``ct.models.MLModel`` to inject it
    (e.g. ``CPU_AND_NE`` to prefer the Apple Neural Engine). The patch is
    scoped to the single ``YOLO()`` call and restored in a ``finally``
    block, so concurrent/later loads are unaffected.

    Args:
        model_path: Path to the exported ``.mlpackage`` model.

    Returns:
        A loaded ``ultralytics.YOLO`` model. Falls back to a plain
        ``YOLO(model_path)`` load when coremltools is unavailable, the
        configured ``self.compute_units`` string is unknown, or the
        installed coremltools lacks the requested enum member.
    """
    from ultralytics import YOLO

    # Map our string config values to coremltools enum member names.
    _COMPUTE_UNIT_MAP = {
        "all": "ALL",
        "cpu_only": "CPU_ONLY",
        "cpu_and_gpu": "CPU_AND_GPU",
        "cpu_and_ne": "CPU_AND_NE",
    }

    ct_enum_name = _COMPUTE_UNIT_MAP.get(self.compute_units)
    if not ct_enum_name:
        _log(f"Unknown compute_units '{self.compute_units}', using default")
        return YOLO(model_path)

    try:
        import coremltools as ct

        # Older coremltools releases may lack some enum members.
        target_units = getattr(ct.ComputeUnit, ct_enum_name, None)
        if target_units is None:
            _log(f"coremltools.ComputeUnit.{ct_enum_name} not available")
            return YOLO(model_path)

        # Temporarily patch MLModel so ultralytics' internal
        # ct.models.MLModel(...) call picks up our compute_units.
        _OrigMLModel = ct.models.MLModel

        class _PatchedMLModel(_OrigMLModel):
            def __init__(self, *args, **kwargs):
                # setdefault: respect an explicit compute_units if the
                # caller ever passes one itself.
                kwargs.setdefault('compute_units', target_units)
                super().__init__(*args, **kwargs)

        ct.models.MLModel = _PatchedMLModel
        try:
            model = YOLO(model_path)
        finally:
            ct.models.MLModel = _OrigMLModel  # Always restore

        # BUGFIX: report the units actually requested — the old message
        # claimed "(Neural Engine preferred)" even for cpu_only/all.
        _log(f"CoreML model loaded with compute_units={ct_enum_name}")
        return model

    except ImportError:
        _log("coremltools not available, loading without compute_units")
        return YOLO(model_path)
498+
442499
def load_optimized(self, model_name: str, use_optimized: bool = True):
443500
"""
444501
Load the best available model for this hardware.
@@ -455,7 +512,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
455512
optimized_path = self.get_optimized_path(model_name)
456513
if optimized_path.exists():
457514
try:
458-
model = YOLO(str(optimized_path))
515+
# On Apple Silicon: route CoreML to Neural Engine
516+
if self.backend == "mps" and self.compute_units != "all":
517+
model = self._load_coreml_with_compute_units(
518+
str(optimized_path))
519+
else:
520+
model = YOLO(str(optimized_path))
459521
self.load_ms = (time.perf_counter() - t0) * 1000
460522
_log(f"Loaded {self.export_format} model ({self.load_ms:.0f}ms)")
461523
return model, self.export_format
@@ -467,7 +529,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
467529
exported = self.export_model(pt_model, model_name)
468530
if exported:
469531
try:
470-
model = YOLO(str(exported))
532+
# On Apple Silicon: route CoreML to Neural Engine
533+
if self.backend == "mps" and self.compute_units != "all":
534+
model = self._load_coreml_with_compute_units(
535+
str(exported))
536+
else:
537+
model = YOLO(str(exported))
471538
self.load_ms = (time.perf_counter() - t0) * 1000
472539
_log(f"Loaded freshly exported {self.export_format} model ({self.load_ms:.0f}ms)")
473540
return model, self.export_format
@@ -508,7 +575,7 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
508575

509576
def to_dict(self) -> dict:
510577
"""Serialize environment info for JSON output."""
511-
return {
578+
d = {
512579
"backend": self.backend,
513580
"device": self.device,
514581
"export_format": self.export_format,
@@ -519,6 +586,9 @@ def to_dict(self) -> dict:
519586
"export_ms": round(self.export_ms, 1),
520587
"load_ms": round(self.load_ms, 1),
521588
}
589+
if self.backend == "mps":
590+
d["compute_units"] = self.compute_units
591+
return d
522592

523593

524594
# ─── CLI: run standalone for diagnostics ─────────────────────────────────────

0 commit comments

Comments
 (0)