From d38fe8ed543d29d34e704905b4d595503ef382de Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 22 Apr 2026 04:05:04 -0700
Subject: [PATCH 1/2] Re-apply prompt-cache bleed fixes to synced main
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three fixes, now riding on upstream 116ee91:

1. save(): slice the KVCacheSimple state T-dim down to P = tokens.count so
   the cached state's T matches cached.tokens.count. Prevents the
   over-allocated prefill buffer from carrying uninitialized tokens past the
   valid prefix.

2. restore(): gate out recurrent-state layers (MambaCache and friends) up
   front. Their state is 2-D with no T dimension, so the dim(2) read in the
   pre-flight check would crash; there is also no trim(excess) operator for a
   recurrent hidden state, so we can't partial-restore one safely. Guard with
   ndim >= 3 inside the min-length scan too, for belt and suspenders.

3. handleChatCompletion(): reorder the decision branch so speculative
   decoding is checked BEFORE the prompt-cache restore. A cache-hit rollback
   corrupts the draft model's KV state (draft and main cycle tokens in
   lock-step), so when draftModelRef is set we bypass the cache entirely and
   pay the full prefill. Partial-match restores stay available on the
   non-spec path, where they still pay off.
---
 Sources/SwiftLM/Server.swift | 56 +++++++++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/Sources/SwiftLM/Server.swift b/Sources/SwiftLM/Server.swift
index 17d68d37..f06ca258 100644
--- a/Sources/SwiftLM/Server.swift
+++ b/Sources/SwiftLM/Server.swift
@@ -946,7 +946,24 @@ actor PromptCache {
     /// If not materialized now, those lazy references point to the live cache tensors
     /// which get overwritten by subsequent requests, causing stale data / SIGTRAP on restore.
     func save(tokens: [Int], cache: [KVCache]) {
-        let states = cache.map { $0.state }
+        let P = tokens.count
+        // For attention KVCacheSimple layers, the state tensor is [B, H, T, D] with a
+        // pre-allocated T that can exceed the actual prompt length P. If we store the
+        // full over-sized buffer, restore()'s trim() by (cached.tokens.count - matchLen)
+        // still leaves T - P slots of garbage beyond the valid prefix. Slice T to P at
+        // save time so cached.tokens.count === cached state's T.
+        let states: [[MLXArray]] = cache.map { layer -> [MLXArray] in
+            let s = layer.state
+            if layer is KVCacheSimple {
+                return s.map { arr -> MLXArray in
+                    guard arr.ndim >= 3 else { return arr }
+                    let T = arr.dim(2)
+                    if T > P { return arr[.ellipsis, ..<P, 0...] }
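
[Note, not part of the patch: the restore()-side gating described in fix 2 is not visible in the hunk above. A minimal sketch of that pre-flight check, reusing the patch's own KVCache / MambaCache / MLXArray vocabulary — the helper name `canPartialRestore` and the exact control flow are assumptions, not the actual diff:

```swift
// Sketch only. Fix 2: refuse a partial restore when any layer carries
// recurrent state, before any dim(2) read can happen.
func canPartialRestore(_ cache: [KVCache]) -> Bool {
    for layer in cache {
        // Recurrent hidden states (MambaCache and friends) are 2-D with no T
        // dimension: dim(2) would crash, and there is no trim(excess)
        // operator to roll one back to a shorter prefix.
        if layer is MambaCache { return false }
        // Belt-and-suspenders inside the min-length scan: only states with
        // ndim >= 3 have a T axis we can measure and trim.
        if layer.state.contains(where: { $0.ndim < 3 }) { return false }
    }
    return true
}
```

When this returns false, restore() falls through to the full-prefill path, matching the "bypass the cache entirely" behavior the commit message describes for the speculative-decoding case.]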