Skip to content

Commit 4d4ade2

Browse files
simbasimba
authored and committed
feat: GPU yield — prevent Metal from starving macOS WindowServer
Every 8 tokens, insert a 50μs `Task.sleep` into the generation loop to yield the GPU. This prevents heavy inference from freezing the macOS UI (WindowServer).

Applied to all 4 generation loops:
- Chat streaming
- Chat non-streaming
- Text streaming
- Text non-streaming
1 parent 4086ce9 commit 4d4ade2

1 file changed

Lines changed: 16 additions & 0 deletions

File tree

Sources/mlx-server/main.swift

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -570,6 +570,10 @@ func handleChatStreaming(
570570
case .chunk(let text):
571571
completionTokenCount += 1
572572
fullText += text
573+
// GPU yield: prevent Metal from starving macOS WindowServer
574+
if completionTokenCount % 8 == 0 {
575+
try? await Task.sleep(for: .microseconds(50))
576+
}
573577
// ── Stop sequence check ──
574578
if let (trimmedText, _) = checkStopSequences(fullText, stopSequences: stopSequences) {
575579
let emittedSoFar = fullText.count - text.count
@@ -637,6 +641,10 @@ func handleChatNonStreaming(
637641
case .chunk(let text):
638642
fullText += text
639643
completionTokenCount += 1
644+
// GPU yield: prevent Metal from starving macOS WindowServer
645+
if completionTokenCount % 8 == 0 {
646+
try? await Task.sleep(for: .microseconds(50))
647+
}
640648
case .toolCall(let tc):
641649
let argsJson = serializeToolCallArgs(tc.function.arguments)
642650
collectedToolCalls.append(ToolCallResponse(
@@ -777,6 +785,10 @@ func handleTextStreaming(
777785
case .chunk(let text):
778786
completionTokenCount += 1
779787
fullText += text
788+
// GPU yield: prevent Metal from starving macOS WindowServer
789+
if completionTokenCount % 8 == 0 {
790+
try? await Task.sleep(for: .microseconds(50))
791+
}
780792
if let (trimmedText, _) = checkStopSequences(fullText, stopSequences: stopSequences) {
781793
let emittedSoFar = fullText.count - text.count
782794
if trimmedText.count > emittedSoFar {
@@ -830,6 +842,10 @@ func handleTextNonStreaming(
830842
case .chunk(let text):
831843
fullText += text
832844
completionTokenCount += 1
845+
// GPU yield: prevent Metal from starving macOS WindowServer
846+
if completionTokenCount % 8 == 0 {
847+
try? await Task.sleep(for: .microseconds(50))
848+
}
833849
case .toolCall, .info:
834850
break
835851
}

0 commit comments

Comments
 (0)