Skip to content

Commit 608a387

Browse files
simbasimba
authored and committed
feat(moe): Expose --stream-experts flag to enable SSD inference streaming for large MoE models
1 parent dfe5bd5 commit 608a387

2 files changed

Lines changed: 20 additions & 2 deletions

File tree

Package.resolved

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Sources/mlx-server/Server.swift

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -79,12 +79,15 @@ struct MLXServer: AsyncParsableCommand {
7979
@Flag(name: .long, help: "Force re-calibration of optimal memory settings (normally auto-cached)")
8080
var calibrate: Bool = false
8181

82+
@Flag(name: .long, help: "Enable SSD expert streaming for MoE models (Flash-MoE style memory-mapping)")
83+
var streamExperts: Bool = false
84+
8285
mutating func run() async throws {
8386
print("[mlx-server] Loading model: \(model)")
8487
let modelId = model
8588

8689
// ── Load model ──
87-
let modelConfig: ModelConfiguration
90+
var modelConfig: ModelConfiguration
8891
let fileManager = FileManager.default
8992
if fileManager.fileExists(atPath: modelId) {
9093
var isDir: ObjCBool = false
@@ -98,6 +101,11 @@ struct MLXServer: AsyncParsableCommand {
98101
} else {
99102
modelConfig = ModelConfiguration(id: modelId)
100103
}
104+
105+
// Inject streaming flag into config to bypass eval(model) if requested
106+
if self.streamExperts {
107+
modelConfig.lazyLoad = true
108+
}
101109

102110
// ── Pre-load profiling ──
103111
// Resolve model directory for profiling (checks HuggingFace cache)
@@ -193,6 +201,16 @@ struct MLXServer: AsyncParsableCommand {
193201
}
194202
}
195203

204+
// ── Apply SSD Expert Streaming ──
205+
if self.streamExperts {
206+
let streamingEnabled = await container.setStreamExperts(true)
207+
if streamingEnabled {
208+
print("[mlx-server] 💾 SSD Expert Streaming enabled (lazy load + layer-sync)")
209+
} else {
210+
print("[mlx-server] ⚠️ Model does not support SSD expert streaming")
211+
}
212+
}
213+
196214
// ── Auto-calibration (Wisdom system) ──
197215
if let plan = partitionPlan {
198216
if self.calibrate {

0 commit comments

Comments
 (0)