Skip to content

Commit 2707be9

Browse files
fix: prevent Metal GPU Watchdog timeout on low-RAM CI runners
- Move MLX_MAX_OPS_PER_BUFFER=50 to top of run() before Metal init
- Enable --stream-experts automatically on <12GB machines in test-dflash.sh so weights are paged via mmap/pread instead of macOS VM swap
- Auto-cap draft tokens to 1 under SSD streaming (minimal fan-out)
- Always compute draftFootprintBytes regardless of --stream-experts flag
1 parent 91e32af commit 2707be9

2 files changed

Lines changed: 30 additions & 6 deletions

File tree

Sources/SwiftLM/Server.swift

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,15 @@ struct MLXServer: AsyncParsableCommand {
290290
setrlimit(RLIMIT_NOFILE, &rl)
291291
}
292292

293+
// Cap Metal command buffer size BEFORE any MLX operation to prevent the
294+
// 5-second Apple GPU Watchdog from killing processes under swap pressure.
295+
// This env var must be set before MLX's Metal backend initializes.
296+
// Value 50 splits large computation graphs into ~1-layer chunks so macOS
297+
// can page in weights incrementally without exceeding the watchdog timeout.
298+
if self.draftModel != nil || self.streamExperts {
299+
setenv("MLX_MAX_OPS_PER_BUFFER", "50", 1)
300+
}
301+
293302
// Register SwiftLM-owned DFlash model types before any model loading.
294303
await registerDFlashModelTypes()
295304

@@ -467,7 +476,6 @@ struct MLXServer: AsyncParsableCommand {
467476
print("[SwiftLM] 💾 Memory strategy: SSD STREAMING (page-cache managed, \(physicalBudget / (1024*1024*1024))GB RAM budget, no swap)")
468477
} else {
469478
Memory.cacheLimit = plan.recommendedCacheLimit
470-
setenv("MLX_MAX_OPS_PER_BUFFER", "50", 1) // Cap buffer size to avoid 5s Metal GPU Watchdog during SSD swap
471479
print("[SwiftLM] \(plan.strategy.emoji) Memory strategy: SWAP-ASSISTED (\(String(format: "%.1f", plan.overcommitRatio))× overcommit, cache limited to \(plan.recommendedCacheLimit / (1024*1024))MB)")
472480
for w in plan.warnings { print("[SwiftLM] \(w)") }
473481
}
@@ -479,7 +487,6 @@ struct MLXServer: AsyncParsableCommand {
479487
print("[SwiftLM] 💾 Memory strategy: SSD STREAMING (page-cache managed, \(physicalBudget / (1024*1024*1024))GB RAM budget, no swap)")
480488
} else {
481489
Memory.cacheLimit = plan.recommendedCacheLimit
482-
setenv("MLX_MAX_OPS_PER_BUFFER", "50", 1) // Cap buffer size to avoid 5s Metal GPU Watchdog during SSD swap
483490
print("[SwiftLM] \(plan.strategy.emoji) Memory strategy: LAYER PARTITIONED (\(plan.recommendedGPULayers)/\(plan.totalLayers) GPU layers, cache limited to \(plan.recommendedCacheLimit / (1024*1024))MB)")
484491
for w in plan.warnings { print("[SwiftLM] \(w)") }
485492
}

tests/test-dflash.sh

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,23 @@ fi
6666
TOTAL_RAM_GB=$(sysctl -n hw.memsize 2>/dev/null | awk '{printf "%.0f", $1 / 1073741824}')
6767
log "System RAM: ${TOTAL_RAM_GB} GB"
6868

69-
if [ "$TOTAL_RAM_GB" -lt 8 ] 2>/dev/null; then
70-
log "⚠️ WARNING: ${TOTAL_RAM_GB} GB RAM detected. Dual-model test requires ~6 GB."
71-
log " Consider running on a machine with ≥8 GB RAM."
69+
# On low-RAM machines (< 12 GB), the combined main + draft model weights
70+
# (~6 GB) exceed available memory after OS reservation. Without SSD
71+
# streaming, all weights must be GPU-resident or swapped via macOS VM,
72+
# which causes Metal command buffers to exceed Apple's 5-second GPU
73+
# Watchdog timeout → Abort trap: 6.
74+
#
75+
# Fix: enable --stream-experts on low-RAM machines. This uses mmap-based
76+
# weight loading (pread from SSD via the OS page cache) so the GPU never
77+
# stalls waiting for swap. Draft tokens are auto-capped to 1 server-side
78+
# to minimise SSD I/O fan-out during the verify pass.
79+
EXTRA_FLAGS=""
80+
if [ "$TOTAL_RAM_GB" -lt 12 ] 2>/dev/null; then
81+
log "⚠️ ${TOTAL_RAM_GB} GB RAM: enabling --stream-experts for SSD-backed weight paging"
82+
log " Combined model weights (~6 GB) exceed available RAM. SSD streaming prevents"
83+
log " Metal GPU Watchdog timeouts during DFlash verify passes."
84+
EXTRA_FLAGS="--stream-experts"
85+
NUM_DRAFT_TOKENS=1 # auto-capped server-side too, but be explicit
7286
fi
7387

7488
# ══════════════════════════════════════════════════════════════════════
@@ -83,11 +97,14 @@ log "Starting server with DFlash speculative decoding..."
8397
log " Main model: $MAIN_MODEL"
8498
log " Draft model: $DRAFT_MODEL"
8599
log " Draft tokens per round: $NUM_DRAFT_TOKENS"
100+
if [ -n "$EXTRA_FLAGS" ]; then
101+
log " Extra flags: $EXTRA_FLAGS"
102+
fi
86103

87104
"$BINARY" --model "$MAIN_MODEL" --port "$PORT" --host "$HOST" \
88105
--draft-model "$DRAFT_MODEL" \
89106
--num-draft-tokens "$NUM_DRAFT_TOKENS" \
90-
--dflash \
107+
--dflash $EXTRA_FLAGS \
91108
> "$LOG_FILE" 2>&1 &
92109
SERVER_PID=$!
93110

0 commit comments

Comments
 (0)