feat: wire GPU/CPU layer partitioning to --gpu-layers flag

simba · simba · commit 29e1d64f306e · 2026-03-27T14:15:34.000-07:00
Phase 2 integration — connects the mlx-swift-lm fork's new
LayerPartitionable protocol to mlx-server's CLI and profiler:

- --gpu-layers N: explicitly set N layers on GPU, rest on CPU
- --gpu-layers auto: use the partition plan's recommended GPU layer count
  (falls back to all layers on GPU when no plan is available)
- Auto-partition: when no --gpu-layers flag is given and the model exceeds
  available RAM (overcommit > 1.0, layer-partitioned or swap-assisted plan),
  automatically applies the recommended GPU layer count

- PartitionPlan: added mutable gpuLayers field (updated after actual
  partitioning) and cpu_layers in /health response

- Adapted .chunk call sites to the API change in the latest fork (the case
  now carries a tuple that includes the tokenId)

- Updated Package.swift comment to note partitioning support
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,4 @@ DerivedData/
 # IDE
 .vscode/
 .idea/
+mlx-swift-lm
diff --git a/Package.resolved b/Package.resolved
diff --git a/Package.swift b/Package.swift
@@ -7,8 +7,7 @@ let package = Package(
     dependencies: [
         // Apple MLX Swift — core inference engine (Apple-maintained, tagged releases)
         .package(url: "https://github.com/ml-explore/mlx-swift", .upToNextMinor(from: "0.30.6")),
-        // Apple's LLM library built on MLX Swift (SharpAI fork)
-        // Pinned to main branch for Qwen3.5 support (PRs #97, #120, #129, #133, #135 — not yet in a release tag)
+        // Apple's LLM library built on MLX Swift (SharpAI fork — with GPU/CPU layer partitioning)
         .package(url: "https://github.com/SharpAI/mlx-swift-lm", branch: "main"),
         // HuggingFace tokenizers + model download
         .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "1.2.0")),
diff --git a/Sources/mlx-server/ModelProfiler.swift b/Sources/mlx-server/ModelProfiler.swift
@@ -136,6 +136,9 @@ struct PartitionPlan: Sendable {
     let estimatedTokensPerSec: Double
     let warnings: [String]
 
+    /// Actual GPU layers after partitioning (updated by server after model load)
+    var gpuLayers: Int
+
     var fitsInMemory: Bool { strategy == .fullGPU }
 
     /// JSON-compatible dictionary for the /health endpoint
@@ -147,7 +150,8 @@ struct PartitionPlan: Sendable {
             "kv_cache_gb": round(kvCacheMemoryGB * 10) / 10,
             "total_required_gb": round(totalRequiredGB * 10) / 10,
             "system_ram_gb": round(systemRAMGB * 10) / 10,
-            "gpu_layers": recommendedGPULayers,
+            "gpu_layers": gpuLayers,
+            "cpu_layers": totalLayers - gpuLayers,
             "total_layers": totalLayers,
             "estimated_tok_s": round(estimatedTokensPerSec * 10) / 10,
         ]
@@ -409,7 +413,8 @@ enum ModelProfiler {
             recommendedMemoryLimit: memoryLimit,
             recommendedCacheLimit: cacheLimit,
             estimatedTokensPerSec: estimatedSpeed,
-            warnings: warnings
+            warnings: warnings,
+            gpuLayers: gpuLayers  // Initially same as recommended; updated after actual partitioning
         )
     }
 
diff --git a/Sources/mlx-server/Server.swift b/Sources/mlx-server/Server.swift
@@ -124,7 +124,6 @@ struct MLXServer: AsyncParsableCommand {
             case .layerPartitioned:
                 Memory.cacheLimit = plan.recommendedCacheLimit
                 print("[mlx-server] \(plan.strategy.emoji) Memory strategy: LAYER PARTITIONED (\(plan.recommendedGPULayers)/\(plan.totalLayers) GPU layers)")
-                print("[mlx-server]    Note: GPU/CPU layer split requires --gpu-layers support (coming soon)")
                 for w in plan.warnings { print("[mlx-server]    \(w)") }
             case .tooLarge:
                 Memory.cacheLimit = plan.recommendedCacheLimit
@@ -136,13 +135,26 @@ struct MLXServer: AsyncParsableCommand {
             return
         }
 
-        // --gpu-layers validation (accept now, prepare for Phase 2)
-        if let gpuLayersArg = self.gpuLayers, gpuLayersArg != "auto" {
-            if let n = Int(gpuLayersArg) {
-                print("[mlx-server] --gpu-layers \(n) requested. Note: layer-level CPU/GPU split is under development.")
+        // ── Determine GPU layer count ──
+        // Priority: 1) explicit --gpu-layers flag, 2) partition plan auto, 3) nil (all GPU)
+        var requestedGPULayers: Int? = nil
+        if let gpuLayersArg = self.gpuLayers {
+            if gpuLayersArg == "auto" {
+                // Use partition plan recommendation if available
+                requestedGPULayers = partitionPlan?.recommendedGPULayers
+                print("[mlx-server] --gpu-layers auto → \(requestedGPULayers.map(String.init) ?? "all") layers on GPU")
+            } else if let n = Int(gpuLayersArg) {
+                requestedGPULayers = n
+                print("[mlx-server] --gpu-layers \(n) → \(n) layers on GPU")
             } else {
-                print("[mlx-server] Warning: --gpu-layers must be 'auto' or an integer, got '\(gpuLayersArg)'. Using auto.")
+                print("[mlx-server] Warning: --gpu-layers must be 'auto' or an integer, got '\(gpuLayersArg)'. Using all GPU.")
             }
+        } else if let plan = partitionPlan,
+                  (plan.strategy == .layerPartitioned || plan.strategy == .swapAssisted),
+                  plan.overcommitRatio > 1.0 {
+            // Auto-partition when model exceeds available RAM (no flag needed)
+            requestedGPULayers = plan.recommendedGPULayers
+            print("[mlx-server] Auto-partitioning: \(plan.recommendedGPULayers)/\(plan.totalLayers) layers on GPU")
         }
 
         let isVision = self.vision
@@ -164,6 +176,20 @@ struct MLXServer: AsyncParsableCommand {
             }
         }
 
+        // ── Apply GPU/CPU layer partitioning ──
+        if let gpuCount = requestedGPULayers {
+            let actual = await container.setGPULayers(gpuCount)
+            if let actual {
+                let total = partitionPlan?.totalLayers ?? actual
+                let cpuCount = total - actual
+                print("[mlx-server] 🔀 Layer split active: \(actual) GPU / \(cpuCount) CPU")
+                // Update the partition plan to reflect actual split
+                partitionPlan?.gpuLayers = actual
+            } else {
+                print("[mlx-server] ⚠️  Model does not support layer partitioning (architecture not yet adapted)")
+            }
+        }
+
         print("[mlx-server] Model loaded. Starting HTTP server on \(host):\(port)")
 
         // ── Capture CLI defaults into a shared config ──
@@ -721,7 +747,7 @@ func handleChatStreaming(
         for await generation in stream {
             if stopped { break }
             switch generation {
-            case .chunk(let text):
+            case .chunk(let text, _):
                 completionTokenCount += 1
                 fullText += text
                 // GPU yield: prevent Metal from starving macOS WindowServer
@@ -792,7 +818,7 @@ func handleChatNonStreaming(
     var tcIndex = 0
     for await generation in stream {
         switch generation {
-        case .chunk(let text):
+        case .chunk(let text, _):
             fullText += text
             completionTokenCount += 1
             // GPU yield: prevent Metal from starving macOS WindowServer
@@ -936,7 +962,7 @@ func handleTextStreaming(
         for await generation in stream {
             if stopped { break }
             switch generation {
-            case .chunk(let text):
+            case .chunk(let text, _):
                 completionTokenCount += 1
                 fullText += text
                 // GPU yield: prevent Metal from starving macOS WindowServer
@@ -993,7 +1019,7 @@ func handleTextNonStreaming(
     var completionTokenCount = 0
     for await generation in stream {
         switch generation {
-        case .chunk(let text):
+        case .chunk(let text, _):
             fullText += text
             completionTokenCount += 1
             // GPU yield: prevent Metal from starving macOS WindowServer