fix: prevent Metal GPU Watchdog timeout on low-RAM CI runners

github-actions[bot] · github-actions[bot] · commit 2707be9eb335 · 2026-04-24T13:41:34.000-07:00
- Move MLX_MAX_OPS_PER_BUFFER=50 to top of run() before Metal init
- Enable --stream-experts automatically on <12GB machines in test-dflash.sh
  so weights are paged via mmap/pread instead of macOS VM swap
- Auto-cap draft tokens to 1 under SSD streaming (minimal fan-out)
- Always compute draftFootprintBytes regardless of --stream-experts flag
diff --git a/Sources/SwiftLM/Server.swift b/Sources/SwiftLM/Server.swift
@@ -290,6 +290,15 @@ struct MLXServer: AsyncParsableCommand {
             setrlimit(RLIMIT_NOFILE, &rl)
         }
 
+        // Cap Metal command buffer size BEFORE any MLX operation to prevent the
+        // 5-second Apple GPU Watchdog from killing processes under swap pressure.
+        // This env var must be set before MLX's Metal backend initializes.
+        // Value 50 splits large computation graphs into ~1-layer chunks so macOS
+        // can page in weights incrementally without exceeding the watchdog timeout.
+        if self.draftModel != nil || self.streamExperts {
+            setenv("MLX_MAX_OPS_PER_BUFFER", "50", 1)
+        }
+
         // Register SwiftLM-owned DFlash model types before any model loading.
         await registerDFlashModelTypes()
 
@@ -467,7 +476,6 @@ struct MLXServer: AsyncParsableCommand {
                     print("[SwiftLM] 💾 Memory strategy: SSD STREAMING (page-cache managed, \(physicalBudget / (1024*1024*1024))GB RAM budget, no swap)")
                 } else {
                     Memory.cacheLimit = plan.recommendedCacheLimit
-                    setenv("MLX_MAX_OPS_PER_BUFFER", "50", 1) // Cap buffer size to avoid 5s Metal GPU Watchdog during SSD swap
                     print("[SwiftLM] \(plan.strategy.emoji) Memory strategy: SWAP-ASSISTED (\(String(format: "%.1f", plan.overcommitRatio))× overcommit, cache limited to \(plan.recommendedCacheLimit / (1024*1024))MB)")
                     for w in plan.warnings { print("[SwiftLM]    \(w)") }
                 }
@@ -479,7 +487,6 @@ struct MLXServer: AsyncParsableCommand {
                     print("[SwiftLM] 💾 Memory strategy: SSD STREAMING (page-cache managed, \(physicalBudget / (1024*1024*1024))GB RAM budget, no swap)")
                 } else {
                     Memory.cacheLimit = plan.recommendedCacheLimit
-                    setenv("MLX_MAX_OPS_PER_BUFFER", "50", 1) // Cap buffer size to avoid 5s Metal GPU Watchdog during SSD swap
                     print("[SwiftLM] \(plan.strategy.emoji) Memory strategy: LAYER PARTITIONED (\(plan.recommendedGPULayers)/\(plan.totalLayers) GPU layers, cache limited to \(plan.recommendedCacheLimit / (1024*1024))MB)")
                     for w in plan.warnings { print("[SwiftLM]    \(w)") }
                 }
diff --git a/tests/test-dflash.sh b/tests/test-dflash.sh
@@ -66,9 +66,23 @@ fi
 TOTAL_RAM_GB=$(sysctl -n hw.memsize 2>/dev/null | awk '{printf "%.0f", $1 / 1073741824}')
 log "System RAM: ${TOTAL_RAM_GB} GB"
 
-if [ "$TOTAL_RAM_GB" -lt 8 ] 2>/dev/null; then
-    log "⚠️  WARNING: ${TOTAL_RAM_GB} GB RAM detected. Dual-model test requires ~6 GB."
-    log "   Consider running on a machine with ≥8 GB RAM."
+# On low-RAM machines (< 12 GB), the combined main + draft model weights
+# (~6 GB) exceed available memory after OS reservation.  Without SSD
+# streaming, all weights must be GPU-resident or swapped via macOS VM,
+# which causes Metal command buffers to exceed Apple's 5-second GPU
+# Watchdog timeout → Abort trap: 6.
+#
+# Fix: enable --stream-experts on low-RAM machines.  Weights are then paged
+# in from SSD via mmap/pread through the OS page cache, so the GPU never
+# stalls waiting for swap.  Draft tokens are auto-capped to 1 server-side
+# to minimise SSD I/O fan-out during the verify pass.
+EXTRA_FLAGS=""
+if [ "$TOTAL_RAM_GB" -lt 12 ] 2>/dev/null; then
+    log "⚠️  ${TOTAL_RAM_GB} GB RAM: enabling --stream-experts for SSD-backed weight paging"
+    log "   Combined model weights (~6 GB) exceed available RAM. SSD streaming prevents"
+    log "   Metal GPU Watchdog timeouts during DFlash verify passes."
+    EXTRA_FLAGS="--stream-experts"
+    NUM_DRAFT_TOKENS=1  # auto-capped server-side too, but be explicit
 fi
 
 # ══════════════════════════════════════════════════════════════════════
@@ -83,11 +97,14 @@ log "Starting server with DFlash speculative decoding..."
 log "  Main model:  $MAIN_MODEL"
 log "  Draft model: $DRAFT_MODEL"
 log "  Draft tokens per round: $NUM_DRAFT_TOKENS"
+if [ -n "$EXTRA_FLAGS" ]; then
+    log "  Extra flags: $EXTRA_FLAGS"
+fi
 
 "$BINARY" --model "$MAIN_MODEL" --port "$PORT" --host "$HOST" \
     --draft-model "$DRAFT_MODEL" \
     --num-draft-tokens "$NUM_DRAFT_TOKENS" \
-    --dflash \
+    --dflash $EXTRA_FLAGS \
     > "$LOG_FILE" 2>&1 &
 SERVER_PID=$!