diff --git a/.gitignore b/.gitignore
index 98bfa6d1..5f20e3f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,6 @@
 __pycache__/
 *.py[cod]
-
 # Distribution / packaging
 workspace/
 build/
@@ -34,3 +33,4 @@ input_images*/
 src/debug_main.py
 temp*.png
 /outputs
+.idea
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 6a11c7ae..6e8b6306 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "depth-anything-3"
 version = "0.0.0"
 description = "Depth Anything 3"
 readme = "README.md"
-requires-python = ">=3.9, <=3.13"
+requires-python = ">=3.10, <=3.13"
 license = { text = "Apache-2.0" }
 authors = [{ name = "Your Name" }]
@@ -21,7 +21,7 @@ dependencies = [
     "imageio",
     "numpy<2",
     "opencv-python",
-    "xformers",
+    "xformers; platform_system!='Darwin'",
     "open3d",
     "fastapi",
     "uvicorn",
@@ -43,7 +43,9 @@ dependencies = [

 [project.optional-dependencies]
 app = ["gradio>=5", "pillow>=9.0"]  # requires that python3>=3.10
-gs = ["gsplat @ git+https://github.com/nerfstudio-project/gsplat.git@0b4dddf04cb687367602c01196913cde6a743d70"]
+gs = [
+    "gsplat>=1.0.0; platform_system!='Darwin'"
+]
 all = ["depth-anything-3[app,gs]"]
diff --git a/requirements.txt b/requirements.txt
index 878db3f3..cde5c146 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ huggingface_hub
 imageio
 numpy<2
 opencv-python
-xformers
+xformers; platform_system!='Darwin'
 open3d
 fastapi
 uvicorn
diff --git a/src/depth_anything_3/api.py b/src/depth_anything_3/api.py
index d0f18aad..5583cda4 100644
--- a/src/depth_anything_3/api.py
+++ b/src/depth_anything_3/api.py
@@ -28,6 +28,7 @@
 from huggingface_hub import PyTorchModelHubMixin
 from PIL import Image

+from depth_anything_3.cache import get_model_cache
 from depth_anything_3.cfg import create_object, load_config
 from depth_anything_3.registry import MODEL_REGISTRY
 from depth_anything_3.specs import Prediction
@@ -72,29 +73,64 @@ class DepthAnything3(nn.Module, PyTorchModelHubMixin):
     _commit_hash: str | None = None  # Set by mixin when loading from Hub

-    def __init__(self, model_name: str = "da3-large", **kwargs):
+    def __init__(self, model_name: str = "da3-large", device: str | torch.device | None = None, use_cache: bool = True, **kwargs):
         """
         Initialize DepthAnything3 with specified preset.

         Args:
-            model_name: The name of the model preset to use.
-            Examples: 'da3-giant', 'da3-large', 'da3metric-large', 'da3nested-giant-large'.
-            **kwargs: Additional keyword arguments (currently unused).
+            model_name: The name of the model preset to use.
+                Examples: 'da3-giant', 'da3-large', 'da3metric-large', 'da3nested-giant-large'.
+            device: Target device ('cuda', 'mps', 'cpu'). If None, auto-detect.
+            use_cache: Whether to use model caching (default: True).
+                Set to False to force reload model from disk.
+            **kwargs: Additional keyword arguments (currently unused).
         """
         super().__init__()
         self.model_name = model_name
+        self.use_cache = use_cache

-        # Build the underlying network
+        # Determine device
+        if device is None:
+            device = self._auto_detect_device()
+        self.device = torch.device(device) if isinstance(device, str) else device
+
+        # Load model configuration
         self.config = load_config(MODEL_REGISTRY[self.model_name])
-        self.model = create_object(self.config)
+
+        # Build or retrieve model from cache
+        if use_cache:
+            cache = get_model_cache()
+            self.model = cache.get(
+                model_name=self.model_name,
+                device=self.device,
+                loader_fn=lambda: self._create_model()
+            )
+        else:
+            logger.info(f"Model cache disabled, loading {self.model_name} from disk")
+            self.model = self._create_model()
+
+        # Ensure model is on correct device and in eval mode
+        self.model = self.model.to(self.device)
         self.model.eval()

         # Initialize processors
         self.input_processor = InputProcessor()
         self.output_processor = OutputProcessor()

-        # Device management (set by user)
-        self.device = None
+    def _auto_detect_device(self) -> torch.device:
+        """Auto-detect best available device."""
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            return torch.device("cpu")
+
+    def _create_model(self) -> nn.Module:
+        """Create and return new model instance."""
+        model = create_object(self.config)
+        model.eval()
+        return model

     @torch.inference_mode()
     def forward(
@@ -304,20 +340,33 @@ def _prepare_model_inputs(
         extrinsics: torch.Tensor | None,
         intrinsics: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
-        """Prepare tensors for model input."""
+        """
+        Prepare tensors for model input with optimized device transfer.
+
+        Uses non_blocking=True for async CPU→GPU transfers, which overlaps
+        data transfer with compute when possible.
+        """
         device = self._get_model_device()

-        # Move images to model device
+        # Pin memory for faster CPU→GPU transfer (CUDA only)
+        if device.type == "cuda" and imgs_cpu.device.type == "cpu":
+            imgs_cpu = imgs_cpu.pin_memory()
+
+        # Move images to model device with non-blocking transfer
         imgs = imgs_cpu.to(device, non_blocking=True)[None].float()

-        # Convert camera parameters to tensors
+        # Convert camera parameters to tensors with non-blocking transfer
         ex_t = (
-            extrinsics.to(device, non_blocking=True)[None].float()
+            extrinsics.pin_memory().to(device, non_blocking=True)[None].float()
+            if extrinsics is not None and device.type == "cuda"
+            else extrinsics.to(device, non_blocking=True)[None].float()
             if extrinsics is not None
             else None
         )
         in_t = (
-            intrinsics.to(device, non_blocking=True)[None].float()
+            intrinsics.pin_memory().to(device, non_blocking=True)[None].float()
+            if intrinsics is not None and device.type == "cuda"
+            else intrinsics.to(device, non_blocking=True)[None].float()
             if intrinsics is not None
             else None
         )
diff --git a/src/depth_anything_3/cache.py b/src/depth_anything_3/cache.py
new file mode 100644
index 00000000..632b6cb4
--- /dev/null
+++ b/src/depth_anything_3/cache.py
@@ -0,0 +1,189 @@
+"""
+Model caching utilities for Depth Anything 3.
+
+Provides model caching functionality to avoid reloading model weights on every instantiation.
+This significantly reduces latency for repeated model creation (2-5s gain).
+"""
+
+from __future__ import annotations
+
+import threading
+from typing import Dict, Optional, Tuple
+import torch
+import torch.nn as nn
+
+from depth_anything_3.utils.logger import logger
+
+
+class ModelCache:
+    """
+    Thread-safe singleton cache for Depth Anything 3 models.
+
+    Caches loaded model weights to avoid reloading from disk on every instantiation.
+    Each unique combination of (model_name, device) is cached separately.
+
+    Usage:
+        cache = ModelCache()
+        model = cache.get(model_name, device, loader_fn)
+        # loader_fn is only called if cache miss
+
+    Thread Safety:
+        Uses threading.Lock to ensure thread-safe access to cache.
+
+    Memory Management:
+        - Models are kept in cache until explicitly cleared
+        - Use clear() to free memory when needed
+        - Use clear_device() to clear specific device models
+    """
+
+    _instance: Optional["ModelCache"] = None
+    _lock = threading.Lock()
+
+    def __new__(cls):
+        """Singleton pattern to ensure single cache instance."""
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = super().__new__(cls)
+                    cls._instance._initialized = False
+        return cls._instance
+
+    def __init__(self):
+        """Initialize cache storage."""
+        if self._initialized:
+            return
+
+        self._cache: Dict[Tuple[str, str], nn.Module] = {}
+        self._cache_lock = threading.Lock()
+        self._initialized = True
+        logger.info("ModelCache initialized")
+
+    def get(
+        self,
+        model_name: str,
+        device: torch.device | str,
+        loader_fn: callable,
+    ) -> nn.Module:
+        """
+        Get cached model or load if not in cache.
+
+        Args:
+            model_name: Name of the model (e.g., "da3-large")
+            device: Target device (cuda, mps, cpu)
+            loader_fn: Function to load model if cache miss.
+                Should return nn.Module
+
+        Returns:
+            Cached or freshly loaded model on specified device
+
+        Example:
+            >>> cache = ModelCache()
+            >>> model = cache.get(
+            ...     "da3-large",
+            ...     "cuda",
+            ...     lambda: create_model()
+            ... )
+        """
+        device_str = str(device)
+        cache_key = (model_name, device_str)
+
+        with self._cache_lock:
+            if cache_key in self._cache:
+                logger.debug(f"Model cache HIT: {model_name} on {device_str}")
+                return self._cache[cache_key]
+
+            logger.info(f"Model cache MISS: {model_name} on {device_str}. Loading...")
+            model = loader_fn()
+            self._cache[cache_key] = model
+            logger.info(f"Model cached: {model_name} on {device_str}")
+
+            return model
+
+    def clear(self) -> None:
+        """
+        Clear entire cache and free memory.
+
+        Removes all cached models and forces garbage collection.
+        Useful when switching between many different models.
+        """
+        with self._cache_lock:
+            num_cached = len(self._cache)
+            self._cache.clear()
+
+            # Force garbage collection to free GPU memory
+            import gc
+
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            if hasattr(torch, "mps") and torch.backends.mps.is_available():
+                torch.mps.empty_cache()
+
+            logger.info(f"Model cache cleared ({num_cached} models removed)")
+
+    def clear_device(self, device: torch.device | str) -> None:
+        """
+        Clear all models on specific device.
+
+        Args:
+            device: Device to clear (e.g., "cuda", "mps", "cpu")
+
+        Example:
+            >>> cache = ModelCache()
+            >>> cache.clear_device("cuda")  # Clear all CUDA models
+        """
+        device_str = str(device)
+
+        with self._cache_lock:
+            keys_to_remove = [key for key in self._cache if key[1] == device_str]
+            for key in keys_to_remove:
+                del self._cache[key]
+
+            # Free device memory
+            if "cuda" in device_str and torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            elif "mps" in device_str and hasattr(torch, "mps") and torch.backends.mps.is_available():
+                torch.mps.empty_cache()
+
+            logger.info(f"Model cache cleared for device {device_str} ({len(keys_to_remove)} models removed)")
+
+    def get_cache_info(self) -> Dict[str, int]:
+        """
+        Get cache statistics.
+
+        Returns:
+            Dictionary with cache info:
+                - total: Total number of cached models
+                - by_device: Number of models per device
+        """
+        with self._cache_lock:
+            info = {
+                "total": len(self._cache),
+                "by_device": {},
+            }
+
+            for model_name, device_str in self._cache.keys():
+                if device_str not in info["by_device"]:
+                    info["by_device"][device_str] = 0
+                info["by_device"][device_str] += 1
+
+            return info
+
+
+# Global singleton instance
+_global_cache = ModelCache()
+
+
+def get_model_cache() -> ModelCache:
+    """
+    Get global model cache instance.
+
+    Returns:
+        Singleton ModelCache instance
+
+    Example:
+        >>> from depth_anything_3.cache import get_model_cache
+        >>> cache = get_model_cache()
+        >>> cache.clear()
+    """
+    return _global_cache
\ No newline at end of file
diff --git a/src/depth_anything_3/utils/io/input_processor.py b/src/depth_anything_3/utils/io/input_processor.py
index fa601941..6a659077 100644
--- a/src/depth_anything_3/utils/io/input_processor.py
+++ b/src/depth_anything_3/utils/io/input_processor.py
@@ -99,13 +99,16 @@ def __call__(
         proc_imgs, out_sizes, out_ixts = self._unify_batch_shapes(proc_imgs, out_sizes, out_ixts)

         batch_tensor = self._stack_batch(proc_imgs)
+
+        # Zero-copy conversion: torch.from_numpy shares memory with the numpy array.
+        # It only works when the array is C-contiguous, which np.ascontiguousarray guarantees.
         out_exts = (
-            torch.from_numpy(np.asarray(out_exts)).float()
+            torch.from_numpy(np.ascontiguousarray(np.asarray(out_exts))).float()
             if out_exts is not None and out_exts[0] is not None
             else None
         )
         out_ixts = (
-            torch.from_numpy(np.asarray(out_ixts)).float()
+            torch.from_numpy(np.ascontiguousarray(np.asarray(out_ixts))).float()
             if out_ixts is not None and out_ixts[0] is not None
             else None
         )
diff --git a/src/depth_anything_3/utils/memory.py b/src/depth_anything_3/utils/memory.py
index 682dad76..2e269a9f 100644
--- a/src/depth_anything_3/utils/memory.py
+++ b/src/depth_anything_3/utils/memory.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 Delanoe Pirard and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """
 GPU memory utility helpers.

@@ -125,3 +139,106 @@ def estimate_memory_requirement(num_images: int, process_res: int) -> float:
     per_image_memory = (process_res / 504) ** 2 * 0.5
     total_memory = base_memory + (num_images * per_image_memory * 0.1)
     return total_memory
+
+
+# ===========================
+# Proactive Memory Management
+# ===========================
+
+
+def cleanup_mps_memory() -> None:
+    """
+    Perform proactive MPS memory cleanup.
+
+    MPS (Apple Silicon) has a unified memory architecture where CPU and GPU
+    share the same memory pool. Proactive cleanup prevents fragmentation.
+    """
+    try:
+        if hasattr(torch, "mps") and torch.backends.mps.is_available():
+            torch.mps.empty_cache()
+            gc.collect()
+            print("MPS memory cache cleared")
+    except Exception as e:
+        print(f"Warning: MPS cleanup failed: {e}")
+
+
+def cleanup_all_device_memory() -> None:
+    """
+    Clean up memory for all available devices (CUDA, MPS, CPU).
+
+    Call this between batch processing or after large allocations
+    to prevent memory fragmentation and OOM errors.
+
+    Example:
+        >>> from depth_anything_3.utils.memory import cleanup_all_device_memory
+        >>> # Process batch 1
+        >>> model.inference(images_batch1)
+        >>> cleanup_all_device_memory()  # Clean between batches
+        >>> # Process batch 2
+        >>> model.inference(images_batch2)
+    """
+    cleanup_cuda_memory()
+    cleanup_mps_memory()
+    gc.collect()
+
+
+def clear_cache_if_low_memory(threshold_gb: float = 2.0) -> bool:
+    """
+    Conditionally clear cache if available memory is below threshold.
+
+    Args:
+        threshold_gb: Memory threshold in GB (default: 2.0)
+
+    Returns:
+        True if cache was cleared, False otherwise
+
+    Example:
+        >>> from depth_anything_3.utils.memory import clear_cache_if_low_memory
+        >>> # Before large allocation
+        >>> if clear_cache_if_low_memory(threshold_gb=3.0):
+        ...     print("Low memory detected, cache cleared")
+    """
+    if torch.cuda.is_available():
+        mem_info = get_gpu_memory_info()
+        if mem_info and mem_info["free_gb"] < threshold_gb:
+            print(f"Low memory detected ({mem_info['free_gb']:.2f} GB < {threshold_gb:.2f} GB)")
+            cleanup_cuda_memory()
+            return True
+
+    elif hasattr(torch, "mps") and torch.backends.mps.is_available():
+        # MPS doesn't expose free memory easily, always clear if requested
+        cleanup_mps_memory()
+        return True
+
+    return False
+
+
+def log_memory_summary() -> None:
+    """
+    Log current memory usage summary for all devices.
+
+    Useful for debugging memory issues or understanding memory patterns.
+    """
+    if torch.cuda.is_available():
+        mem_info = get_gpu_memory_info()
+        if mem_info:
+            print(
+                f"[CUDA Memory] Allocated: {mem_info['allocated_gb']:.2f} GB, "
+                f"Reserved: {mem_info['reserved_gb']:.2f} GB, "
+                f"Free: {mem_info['free_gb']:.2f} GB / {mem_info['total_gb']:.2f} GB "
+                f"({mem_info['utilization']:.1f}% used)"
+            )
+
+    elif hasattr(torch, "mps") and torch.backends.mps.is_available():
+        try:
+            allocated = torch.mps.current_allocated_memory() / (1024**3)
+            driver_allocated = torch.mps.driver_allocated_memory() / (1024**3)
+            print(
+                f"[MPS Memory] Allocated: {allocated:.2f} GB, "
+                f"Driver Allocated: {driver_allocated:.2f} GB"
+            )
+        except Exception as e:
+            print(f"[MPS Memory] Stats unavailable: {e}")
+
+    else:
+        print("[CPU Memory] Stats not available via PyTorch")
diff --git a/src/depth_anything_3/utils/zero_copy.py b/src/depth_anything_3/utils/zero_copy.py
new file mode 100644
index 00000000..cfbb1cbb
--- /dev/null
+++ b/src/depth_anything_3/utils/zero_copy.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2025 Delanoe Pirard and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Zero-copy utilities for efficient tensor operations.
+
+Provides utilities to minimize memory copies between NumPy and PyTorch,
+especially for CPU→GPU transfers.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import torch
+
+
+def numpy_to_torch_zerocopy(arr: np.ndarray, dtype: torch.dtype | None = None, device: str | torch.device = "cpu") -> torch.Tensor:
+    """
+    Convert NumPy array to PyTorch tensor with zero-copy when possible.
+
+    Zero-copy is possible when:
+    1. Array is C-contiguous
+    2. Target device is CPU
+    3. dtype is compatible
+
+    For GPU transfers, the CPU-side tensor still shares memory with the array,
+    so only the single host-to-device copy remains.
+
+    Args:
+        arr: Input NumPy array
+        dtype: Target PyTorch dtype (if None, infer from numpy dtype)
+        device: Target device ('cpu', 'cuda', 'mps')
+
+    Returns:
+        PyTorch tensor on specified device
+
+    Example:
+        >>> arr = np.random.rand(1000, 1000)
+        >>> tensor = numpy_to_torch_zerocopy(arr, device='cuda')
+        >>> # No intermediate copy on CPU if arr is C-contiguous
+    """
+    # Check if zero-copy is possible
+    is_contiguous = arr.flags['C_CONTIGUOUS']
+
+    if not is_contiguous:
+        # Need to make contiguous copy anyway
+        arr = np.ascontiguousarray(arr)
+
+    # Create tensor with zero-copy (shares memory on CPU)
+    tensor = torch.from_numpy(arr)
+
+    # Apply dtype conversion if needed
+    if dtype is not None and tensor.dtype != dtype:
+        tensor = tensor.to(dtype)
+
+    # Move to target device
+    if str(device) != "cpu":
+        # Use non_blocking for async transfer
+        tensor = tensor.to(device, non_blocking=True)
+
+    return tensor
+
+
+def ensure_pinned_memory(arr: np.ndarray) -> np.ndarray:
+    """
+    Ensure NumPy array uses pinned (page-locked) memory for faster GPU transfers.
+
+    Pinned memory allows DMA (Direct Memory Access) for faster CPU→GPU transfers.
+    Only beneficial for repeated transfers of the same data.
+
+    Args:
+        arr: Input NumPy array
+
+    Returns:
+        Array in pinned memory
+
+    Note:
+        Pinned memory is a limited resource. Only use for frequently transferred data.
+        For CUDA devices only (no effect on MPS/CPU).
+    """
+    if not torch.cuda.is_available():
+        return arr
+
+    # Convert to torch tensor with pinned memory
+    tensor = torch.from_numpy(arr).pin_memory()
+
+    # Convert back to numpy (shares pinned memory)
+    # Note: This creates a new numpy array view over pinned memory
+    return tensor.numpy()
+
+
+def stack_arrays_zerocopy(arrays: list[np.ndarray], dtype: np.dtype | None = None) -> np.ndarray:
+    """
+    Stack list of arrays with minimal copying.
+
+    Args:
+        arrays: List of NumPy arrays to stack
+        dtype: Target dtype (if None, use arrays[0].dtype)
+
+    Returns:
+        Stacked array
+
+    Note:
+        If all arrays already have compatible dtype and layout,
+        np.stack uses optimized C-level stacking.
+    """
+    if not arrays:
+        raise ValueError("Cannot stack empty list")
+
+    # Check if all arrays have compatible dtype
+    if dtype is None:
+        dtype = arrays[0].dtype
+
+    # Ensure all arrays are C-contiguous with same dtype
+    # This may create copies, but better done once than repeatedly
+    arrays_contig = []
+    for arr in arrays:
+        if arr.dtype != dtype or not arr.flags['C_CONTIGUOUS']:
+            arr = np.ascontiguousarray(arr, dtype=dtype)
+        arrays_contig.append(arr)
+
+    # Stack (single memory allocation + copy)
+    return np.stack(arrays_contig, axis=0)
+
+
+def batch_to_device(
+    tensors: list[torch.Tensor] | tuple[torch.Tensor, ...],
+    device: str | torch.device,
+    non_blocking: bool = True
+) -> list[torch.Tensor]:
+    """
+    Move multiple tensors to device with optimal settings.
+
+    Args:
+        tensors: List/tuple of tensors to move
+        device: Target device
+        non_blocking: Use async transfer (default: True)
+
+    Returns:
+        List of tensors on target device
+
+    Example:
+        >>> tensors = [torch.rand(100), torch.rand(200)]
+        >>> gpu_tensors = batch_to_device(tensors, 'cuda')
+    """
+    return [t.to(device, non_blocking=non_blocking) if t is not None else None for t in tensors]
+
+
+def get_optimal_pin_memory() -> bool:
+    """
+    Determine if pin_memory should be used for DataLoader.
+
+    Returns:
+        True if CUDA is available and pinned memory is beneficial
+
+    Usage:
+        >>> DataLoader(dataset, pin_memory=get_optimal_pin_memory())
+    """
+    return torch.cuda.is_available()
\ No newline at end of file
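
Usage sketch (not part of the patch): a minimal end-to-end example of the caching and device API this diff introduces. It assumes the package is importable as depth_anything_3; the inference call is commented out and hypothetical, since its signature is defined elsewhere in api.py and not shown here.

    from depth_anything_3.api import DepthAnything3
    from depth_anything_3.cache import get_model_cache
    from depth_anything_3.utils.memory import cleanup_all_device_memory

    # First construction loads weights from disk and populates the cache;
    # the device is auto-detected (CUDA -> MPS -> CPU) when not given.
    model = DepthAnything3(model_name="da3-large")

    # A second instantiation with the same (model_name, device) pair reuses
    # the cached weights instead of reloading them from disk.
    model_again = DepthAnything3(model_name="da3-large", device=model.device)

    # Hypothetical inference call; see api.py for the actual signature.
    # prediction = model.inference(["img_0.png", "img_1.png"])

    # Inspect and clear the global cache between large jobs.
    cache = get_model_cache()
    print(cache.get_cache_info())  # e.g. {'total': 1, 'by_device': {'cuda': 1}}
    cache.clear()
    cleanup_all_device_memory()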