1- // InferenceEngine.swift — Core MLX inference actor for SwiftLM Chat
2- // Extracted from Server.swift — no HTTP, no CLI, pure Swift concurrency .
1+ // InferenceEngine.swift — Core MLX inference engine for SwiftLM Chat
2+ // Handles: model load/unload, token streaming, memory/thermal pressure response .
33
44import Foundation
55import MLX
66import MLXLLM
77import MLXLMCommon
8+ import Hub
9+ #if canImport(UIKit)
10+ import UIKit
11+ #endif
12+
13+ // MARK: — Model State
814
9- /// The state of the inference engine.
1015public enum ModelState : Equatable , Sendable {
1116 case idle
1217 case downloading( progress: Double , speed: String )
@@ -16,104 +21,194 @@ public enum ModelState: Equatable, Sendable {
1621 case error( String )
1722}
1823
19- /// Token-level output from the generation stream.
// MARK: — Thermal State

/// Coarse device thermal level, mirroring `ProcessInfo.ThermalState`.
public enum ThermalLevel: Sendable {
    case nominal, fair, serious, critical

    /// Human-readable description suitable for status UI.
    public var displayString: String {
        switch self {
        case .nominal:  return "Normal"
        case .fair:     return "Warm"
        case .serious:  return "Hot — generation may be slow"
        case .critical: return "Critical — generation paused"
        }
    }

    /// Whether the engine should refuse new heavy work at this level.
    public var isThrottled: Bool {
        switch self {
        case .serious, .critical: return true
        case .nominal, .fair:     return false
        }
    }
}
38+
39+ // MARK: — Generation Token
40+
/// A single streamed unit of model output.
public struct GenerationToken: Sendable {
    /// The decoded text fragment for this token.
    public let text: String
    /// True while this token falls inside a `<think>…</think>` span.
    public let isThinking: Bool

    public init(text: String, isThinking: Bool = false) {
        self.text = text
        self.isThinking = isThinking
    }
}
2950
30- /// Thread-safe MLX inference engine. One instance per app.
31- /// Uses Swift actor isolation so MLX calls never race.
51+ // MARK: — InferenceEngine
52+
3253@MainActor
3354public final class InferenceEngine : ObservableObject {
3455 @Published public private( set) var state : ModelState = . idle
56+ @Published public private( set) var thermalLevel : ThermalLevel = . nominal
3557
36- /// Shared download manager — exposes download progress and local cache state .
58+ /// Shared download + storage manager .
3759 public let downloadManager = ModelDownloadManager ( )
3860
3961 private var container : ModelContainer ?
4062 private var currentModelId : String ?
4163 private var generationTask : Task < Void , Never > ?
64+ private var pressureObserver : NSObjectProtocol ?
65+ private var thermalObserver : NSObjectProtocol ?
66+
67+ public init ( ) {
68+ setupPressureHandlers ( )
69+ }
4270
43- public init ( ) { }
71+ deinit {
72+ if let o = pressureObserver { NotificationCenter . default. removeObserver ( o) }
73+ if let o = thermalObserver { NotificationCenter . default. removeObserver ( o) }
74+ }
75+
76+ // MARK: — Pressure Handlers
77+
78+ private func setupPressureHandlers( ) {
79+ // iOS memory pressure → unload model weights immediately
80+ #if canImport(UIKit)
81+ pressureObserver = NotificationCenter . default. addObserver (
82+ forName: UIApplication . didReceiveMemoryWarningNotification,
83+ object: nil ,
84+ queue: . main
85+ ) { [ weak self] _ in
86+ Task { @MainActor [ weak self] in
87+ guard let self else { return }
88+ // Only unload if not actively generating
89+ if case . generating = self . state { return }
90+ self . unload ( )
91+ self . state = . error( " Unloaded due to memory pressure. Tap to reload. " )
92+ }
93+ }
94+ #endif
95+
96+ // Thermal state monitoring (all platforms)
97+ thermalObserver = NotificationCenter . default. addObserver (
98+ forName: ProcessInfo . thermalStateDidChangeNotification,
99+ object: nil ,
100+ queue: . main
101+ ) { [ weak self] _ in
102+ Task { @MainActor [ weak self] in
103+ self ? . updateThermalLevel ( )
104+ }
105+ }
106+ updateThermalLevel ( )
107+ }
108+
109+ private func updateThermalLevel( ) {
110+ switch ProcessInfo . processInfo. thermalState {
111+ case . nominal: thermalLevel = . nominal
112+ case . fair: thermalLevel = . fair
113+ case . serious: thermalLevel = . serious
114+ case . critical:
115+ thermalLevel = . critical
116+ // Critical: stop any generation immediately
117+ stopGeneration ( )
118+ @unknown default : thermalLevel = . nominal
119+ }
120+ }
44121
45122 // MARK: — Model Loading
46123
47124 /// Load a model by HuggingFace ID. Downloads if not cached.
125+ /// Uses ModelStorage.cacheRoot as the HubApi download base.
48126 public func load( modelId: String ) async {
49127 guard state != . ready( modelId: modelId) else { return }
128+ guard !thermalLevel. isThrottled else {
129+ state = . error( " Device is too hot. Let it cool before loading a model. " )
130+ return
131+ }
50132
51133 state = . loading
52134 currentModelId = modelId
53135
54136 do {
137+ // Point HubApi at ModelStorage.cacheRoot so downloads land in the right
138+ // place on both platforms (macOS: ~/.cache/HF, iOS: Application Support)
139+ let hub = HubApi ( downloadBase: ModelStorage . cacheRoot)
55140 let config = ModelConfiguration ( id: modelId)
141+
56142 container = try await LLMModelFactory . shared. loadContainer (
143+ hub: hub,
57144 configuration: config
58145 ) { [ weak self] progress in
59146 Task { @MainActor in
60147 guard let self else { return }
61148 let pct = progress. fractionCompleted
62- let speedMBps = progress. throughput. map { $0 / 1_000_000 }
63- let speedStr = speedMBps. map { String ( format: " %.1f MB/s " , $0) } ?? " "
149+ let speedBytesPerSec = progress. userInfo [ . throughputKey] as? Double
150+ let speedStr = speedBytesPerSec
151+ . map { String ( format: " %.1f MB/s " , $0 / 1_000_000 ) } ?? " "
64152 self . state = . downloading( progress: pct, speed: speedStr)
153+
65154 self . downloadManager. updateProgress ( ModelDownloadProgress (
66155 modelId: modelId,
67156 fractionCompleted: pct,
68- speedMBps: speedMBps
157+ currentFile: " " ,
158+ speedMBps: speedBytesPerSec. map { $0 / 1_000_000 }
69159 ) )
70160 }
71161 }
72- downloadManager. completeDownload ( modelId: modelId)
162+
163+ downloadManager. clearProgress ( modelId: modelId)
164+ downloadManager. lastLoadedModelId = modelId
165+ downloadManager. refresh ( )
73166 state = . ready( modelId: modelId)
167+
74168 } catch {
75- downloadManager. cancelDownload ( modelId: modelId)
169+ downloadManager. clearProgress ( modelId: modelId)
76170 state = . error( " Failed to load \( modelId) : \( error. localizedDescription) " )
77171 container = nil
78172 }
79173 }
80174
81- /// Unload the current model and free memory.
175+ /// Unload the current model and free all GPU memory.
82176 public func unload( ) {
83177 generationTask? . cancel ( )
84178 container = nil
85179 currentModelId = nil
86180 state = . idle
87- MLX . Memory . clearCache ( )
181+ MLX . GPU . set ( cacheLimit : 0 )
88182 }
89183
90184 // MARK: — Generation
91185
92- /// Generate a response as an AsyncStream of tokens.
93- /// Each yielded value is a `GenerationToken` (text + thinking flag).
94186 public nonisolated func generate(
95187 messages: [ ChatMessage ] ,
96188 config: GenerationConfig = . default
97189 ) -> AsyncStream < GenerationToken > {
98190 AsyncStream { continuation in
99191 Task { @MainActor in
100192 guard let container = self . container else {
101- continuation. finish ( )
102- return
193+ continuation. finish ( ) ; return
194+ }
195+
196+ // Don't generate when throttled
197+ if self . thermalLevel == . critical {
198+ continuation. yield ( GenerationToken ( text: " \n \n [Generation paused: device temperature critical] " ) )
199+ continuation. finish ( ) ; return
103200 }
104201
105202 self . state = . generating
106203
107204 do {
108- let mlxMessages = messages. map { msg -> [ String : String ] in
109- [ " role " : msg. role. rawValue, " content " : msg. content]
110- }
111-
112- // Build MLXLMCommon GenerateParameters
205+ let mlxMessages = messages. map { [ " role " : $0. role. rawValue, " content " : $0. content] }
113206 var params = GenerateParameters ( temperature: config. temperature)
114207 params. topP = config. topP
115208
116209 var thinkingActive = false
210+ var outputText = " "
211+ var tokenCount = 0
117212
118213 let userInput = UserInput ( messages: mlxMessages)
119214 let lmInput = try await container. prepare ( input: userInput)
@@ -122,21 +217,15 @@ public final class InferenceEngine: ObservableObject {
122217 parameters: params
123218 )
124219
125- var outputText = " "
126- var tokenCount = 0
127-
128220 for await generation in stream {
129- switch generation {
130- case . chunk( let text, tokenId: _) :
221+ guard !Task. isCancelled else { break }
222+
223+ if case . chunk( let text, tokenId: _) = generation {
131224 outputText += text
132225 tokenCount += 1
133226
134- if tokenCount >= config. maxTokens {
135- continuation. finish ( )
136- break
137- }
227+ if tokenCount >= config. maxTokens { break }
138228
139- // Thinking state tracking (<think> tags)
140229 if config. enableThinking {
141230 if outputText. contains ( " <think> " ) && !outputText. contains ( " </think> " ) {
142231 thinkingActive = true
@@ -146,13 +235,9 @@ public final class InferenceEngine: ObservableObject {
146235 }
147236
148237 continuation. yield ( GenerationToken ( text: text, isThinking: thinkingActive) )
149-
150- default :
151- break
152238 }
153239 }
154240 } catch {
155- // Yield error as a token so the UI can display it
156241 continuation. yield ( GenerationToken ( text: " \n \n [Error: \( error. localizedDescription) ] " ) )
157242 }
158243
@@ -162,12 +247,9 @@ public final class InferenceEngine: ObservableObject {
162247 }
163248 }
164249
165- /// Cancel any in-progress generation.
166250 public func stopGeneration( ) {
167251 generationTask? . cancel ( )
168252 generationTask = nil
169- if let id = currentModelId {
170- state = . ready( modelId: id)
171- }
253+ if let id = currentModelId { state = . ready( modelId: id) }
172254 }
173255}
0 commit comments