Skip to content

Commit 91e32af

Browse files
fix: cap Metal command buffer size during swap-assisted inference to prevent GPU timeouts
1 parent b5037f6 commit 91e32af

1 file changed

Lines changed: 3 additions & 2 deletions

File tree

Sources/SwiftLM/Server.swift

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -356,8 +356,7 @@ struct MLXServer: AsyncParsableCommand {
356356
// instead of weightMemoryGB * 1_073_741_824 to avoid the ~7% GiB/GB
357357
// mismatch flagged in Copilot review (weightMemoryGB = bytes / 1e9, not /2^30).
358358
let draftFootprintBytes: Int
359-
if self.streamExperts,
360-
let draftPath = self.draftModel,
359+
if let draftPath = self.draftModel,
361360
let draftDir = resolveModelDirectory(modelId: draftPath),
362361
let draftProfile = ModelProfiler.profile(modelDirectory: draftDir, modelId: draftPath) {
363362
draftFootprintBytes = draftProfile.weightFileSizeBytes
@@ -468,6 +467,7 @@ struct MLXServer: AsyncParsableCommand {
468467
print("[SwiftLM] 💾 Memory strategy: SSD STREAMING (page-cache managed, \(physicalBudget / (1024*1024*1024))GB RAM budget, no swap)")
469468
} else {
470469
Memory.cacheLimit = plan.recommendedCacheLimit
470+
setenv("MLX_MAX_OPS_PER_BUFFER", "50", 1) // Cap buffer size to avoid 5s Metal GPU Watchdog during SSD swap
471471
print("[SwiftLM] \(plan.strategy.emoji) Memory strategy: SWAP-ASSISTED (\(String(format: "%.1f", plan.overcommitRatio))× overcommit, cache limited to \(plan.recommendedCacheLimit / (1024*1024))MB)")
472472
for w in plan.warnings { print("[SwiftLM] \(w)") }
473473
}
@@ -479,6 +479,7 @@ struct MLXServer: AsyncParsableCommand {
479479
print("[SwiftLM] 💾 Memory strategy: SSD STREAMING (page-cache managed, \(physicalBudget / (1024*1024*1024))GB RAM budget, no swap)")
480480
} else {
481481
Memory.cacheLimit = plan.recommendedCacheLimit
482+
setenv("MLX_MAX_OPS_PER_BUFFER", "50", 1) // Cap buffer size to avoid 5s Metal GPU Watchdog during SSD swap
482483
print("[SwiftLM] \(plan.strategy.emoji) Memory strategy: LAYER PARTITIONED (\(plan.recommendedGPULayers)/\(plan.totalLayers) GPU layers, cache limited to \(plan.recommendedCacheLimit / (1024*1024))MB)")
483484
for w in plan.warnings { print("[SwiftLM] \(w)") }
484485
}

0 commit comments

Comments
 (0)