SharpAI · solderzzc · Apr 24, 2026 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -218,6 +218,114 @@ jobs:
           path: /tmp/SwiftLM-test-speculative.log
           retention-days: 7
 
+  # ── DFlash Speculative Decoding E2E ──
+  # Uses the standard macos-15 runner (7 GB RAM).
+  dflash-speculative-decoding:
+    runs-on: macos-15
+    timeout-minutes: 45
+    needs: build_and_unit_test
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      # Best-effort: `|| true` keeps the job going if the download fails.
+      - name: Install Metal Toolchain
+        run: xcodebuild -downloadComponent MetalToolchain || true
+
+      - name: Cache Swift packages
+        uses: actions/cache@v4
+        with:
+          path: .build
+          key: ${{ runner.os }}-spm-SwiftLM-v3-${{ hashFiles('Package.resolved') }}
+          restore-keys: |
+            ${{ runner.os }}-spm-SwiftLM-v3-
+
+      - name: Clear stale module cache
+        run: find .build -type d -name ModuleCache -exec rm -rf {} + 2>/dev/null || true
+
+      - name: Resolve dependencies
+        run: swift package resolve
+
+      - name: Build (Release)
+        run: swift build -c release
+
+      - name: Compile and install custom MLX Metal library
+        run: |
+          # The default `run` shell is `bash -e` without pipefail, so a failing
+          # cmake/make piped into `tail` would otherwise be reported as success.
+          set -o pipefail
+          if [ -d "mlx-swift/Source/Cmlx/mlx" ]; then
+            MLX_SRC="mlx-swift/Source/Cmlx/mlx"
+          else
+            MLX_SRC=".build/checkouts/mlx-swift/Source/Cmlx/mlx"
+          fi
+          mkdir -p .build/metallib_build
+          pushd .build/metallib_build
+          cmake "../../$MLX_SRC" \
+            -DMLX_BUILD_TESTS=OFF \
+            -DMLX_BUILD_EXAMPLES=OFF \
+            -DMLX_BUILD_BENCHMARKS=OFF \
+            -DMLX_BUILD_PYTHON_BINDINGS=OFF \
+            -DMLX_METAL_JIT=OFF \
+            -DMLX_ENABLE_NAX=1 \
+            -DCMAKE_BUILD_TYPE=Release 2>&1 | tail -20
+          make mlx-metallib -j"$(sysctl -n hw.ncpu)" 2>&1 | tail -20
+          popd
+          # `|| true`: `head` closing the pipe early must not trip pipefail.
+          BUILT=$(find .build/metallib_build -name "mlx.metallib" | head -n 1 || true)
+          if [ -z "$BUILT" ]; then
+            echo "::error::mlx.metallib was not found under .build/metallib_build"
+            exit 1
+          fi
+          cp "$BUILT" .build/release/mlx.metallib
+          # Virtualenv providing the `hf` CLI used by the pre-download step.
+          python3 -m venv /tmp/mlx_venv
+          /tmp/mlx_venv/bin/pip install --quiet huggingface_hub hf
+
+      # Static key: once populated, the cache is reused as-is until the key
+      # is changed or GitHub evicts the entry.
+      - name: Cache MLX models (dflash + main)
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: mlx-dflash-qwen35-4b
+
+      # Best-effort warm-up: download failures are ignored here (`|| true`).
+      - name: Pre-download HuggingFace models
+        run: |
+          source /tmp/mlx_venv/bin/activate
+          hf download mlx-community/Qwen3.5-4B-4bit || true
+          hf download z-lab/Qwen3.5-4B-DFlash || true
+
+      # Runs the E2E script up to three times, 10 s apart, and fails the
+      # step only when every attempt fails.
+      - name: Run DFlash E2E
+        env:
+          HF_HUB_DOWNLOAD_TIMEOUT: "900"
+        run: |
+          chmod +x tests/test-dflash.sh
+          for attempt in 1 2 3; do
+            echo "Attempt $attempt of 3..."
+            if tests/test-dflash.sh .build/release/SwiftLM 15415; then
+              exit 0
+            fi
+            if [ "$attempt" -lt 3 ]; then
+              echo "Test failed, retrying in 10s..."
+              sleep 10
+            fi
+          done
+          echo "All attempts failed"
+          exit 1
+
+      - name: Upload dflash test logs on failure
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: dflash-test-logs
+          path: /tmp/SwiftLM-test-dflash.log
+          retention-days: 7
+
   # ── Speculative Decoding Memory Evaluation ──
   # Runs the 2B model with NUM_DRAFT_TOKENS=2 to check peak
   # memory compression/efficiency. Emits vm_stat readings as step summary.

diff --git a/.gitignore b/.gitignore
@@ -28,3 +28,6 @@ tmp/
 .agents/harness/audio-omni-gemma4/runs/
 .venv/
 mem-palace/
+
+
+tests/DFlash/intermediates/
diff --git a/Package.resolved b/Package.resolved
diff --git a/Package.swift b/Package.swift
@@ -6,8 +6,10 @@ let package = Package(
     platforms: [.macOS(.v14), .iOS(.v17)],
     products: [
         .library(name: "MLXInferenceCore", targets: ["MLXInferenceCore"]),
+        .library(name: "DFlash", targets: ["DFlash"]),
         .executable(name: "SwiftLM", targets: ["SwiftLM"]),
-        .executable(name: "SwiftBuddy", targets: ["SwiftBuddy"])
+        .executable(name: "SwiftBuddy", targets: ["SwiftBuddy"]),
+        .executable(name: "DFlashKernelBench", targets: ["DFlashKernelBench"])
     ],
     dependencies: [
         // Local Apple MLX Swift fork for C++ extensions
@@ -29,6 +31,7 @@ let package = Package(
             name: "SwiftLM",
             dependencies: [
                 "MLXInferenceCore",
+                "DFlash",
                 .product(name: "MLX", package: "mlx-swift"),
                 .product(name: "MLXLLM", package: "mlx-swift-lm"),
                 .product(name: "MLXVLM", package: "mlx-swift-lm"),
@@ -40,6 +43,16 @@ let package = Package(
             ],
             path: "Sources/SwiftLM"
         ),
+        // ── DFlash Kernel Micro-Benchmark ───────────────────────────
+        .executableTarget(
+            name: "DFlashKernelBench",
+            dependencies: [
+                "DFlash",
+                .product(name: "MLX", package: "mlx-swift"),
+                .product(name: "MLXNN", package: "mlx-swift"),
+            ],
+            path: "Sources/DFlashKernelBench"
+        ),
         // ── STFT Audio Profiling Testing Script (macOS only) ───────────
         .executableTarget(
             name: "SwiftLMTestSTFT",
@@ -86,6 +99,19 @@ let package = Package(
                 .enableExperimentalFeature("StrictConcurrency")
             ]
         ),
+        // ── DFlash Speculative Decoding ─────────────────────────────
+        .target(
+            name: "DFlash",
+            dependencies: [
+                .product(name: "MLX", package: "mlx-swift"),
+                // Declared explicitly: DFlash sources `import MLXNN`
+                // (e.g. DFlashDraftBackend.swift).
+                .product(name: "MLXNN", package: "mlx-swift"),
+                .product(name: "MLXLLM", package: "mlx-swift-lm"),
+                .product(name: "MLXLMCommon", package: "mlx-swift-lm"),
+            ],
+            path: "Sources/DFlash",
+            exclude: ["DFlashKernelsOptimized.swift"]
+        ),
         // ── Automated Test Harness ──────────────────────────────────
         .testTarget(
             name: "SwiftBuddyTests",

diff --git a/README.md b/README.md
@@ -438,6 +438,8 @@ curl http://localhost:5413/v1/chat/completions \
 | `--turbo-kv` | `false` | Enable TurboQuant 3-bit KV cache compression (activates after 2048 tokens, server-wide) |
 | `--draft-model` | (none) | Draft model path/ID for speculative decoding. When used with `--stream-experts`, `--num-draft-tokens` is auto-capped to 1 to minimise SSD I/O fan-out (see performance note above). |
 | `--num-draft-tokens` | `4` | Tokens per speculation round. Auto-capped to 1 when combined with `--stream-experts`. |
+| `--dflash` | `false` | Enable DFlash block-diffusion speculative decoding. Requires a compatible DFlash draft model. |
+| `--dflash-block-size` | (auto) | Number of tokens per DFlash draft block. Defaults to the value in the draft model's config. |
 
 ## 🔧 Per-Request API Parameters
 

diff --git a/Sources/DFlash/DFlashDraftBackend.swift b/Sources/DFlash/DFlashDraftBackend.swift
@@ -0,0 +1,91 @@
+// Copyright 2026 SwiftLM Contributors
+// MIT License — see LICENSE file
+// Based on DFlash (arXiv:2602.06036)
+
+import Foundation
+import MLX
+import MLXLMCommon
+import MLXNN
+
+// MARK: - Draft Backend
+
+/// Backend for generating draft tokens using the DFlash draft model.
+public final class DFlashDraftBackend: @unchecked Sendable {
+
+    public init() {}
+
+    /// Build one `ContextOnlyDraftKVCache` for every layer of `draftModel`,
+    /// each configured with the same `sinkSize` and `windowSize`.
+    public func makeCache(
+        draftModel: DFlashDraftModel,
+        sinkSize: Int = 64,
+        windowSize: Int = 1024
+    ) -> [ContextOnlyDraftKVCache] {
+        var layerCaches: [ContextOnlyDraftKVCache] = []
+        for _ in 0 ..< draftModel.layers.count {
+            layerCaches.append(ContextOnlyDraftKVCache(sinkSize: sinkSize, windowSize: windowSize))
+        }
+        return layerCaches
+    }
+
+    /// Greedily draft the tail of a block with the DFlash draft model.
+    ///
+    /// - Parameters:
+    ///   - targetModel: Target model; supplies `dflashEmbedTokens` and `dflashLmHeadLogits`
+    ///   - draftModel: DFlash draft model invoked on the embedded block
+    ///   - draftCache: KV caches handed to the draft model
+    ///   - stagedFirst: Source of the block's first token (already verified by the target)
+    ///   - targetHidden: Target hidden states forwarded to the draft model
+    ///   - blockLen: Block length; must be greater than 1
+    ///   - maskTokenTail: Mask token IDs; the first `blockLen - 1` fill the block
+    ///   - suppressTokenMask: Optional mask forwarded to greedy decoding
+    /// - Returns: Drafted token IDs with axis 0 squeezed away; evaluation is
+    ///   scheduled via `asyncEval` before returning
+    public func draftGreedy(
+        targetModel: any DFlashTargetModel,
+        draftModel: DFlashDraftModel,
+        draftCache: [ContextOnlyDraftKVCache],
+        stagedFirst: MLXArray,
+        targetHidden: MLXArray,
+        blockLen: Int,
+        maskTokenTail: MLXArray,
+        suppressTokenMask: MLXArray? = nil
+    ) -> MLXArray {
+        precondition(blockLen > 1, "draftGreedy requires blockLen > 1")
+
+        // Block layout: the staged token followed by `blockLen - 1` mask tokens.
+        let stagedHead = stagedFirst[..<1]
+        let maskTail = maskTokenTail[..<(blockLen - 1)]
+        let blockIDs = concatenated([stagedHead, maskTail], axis: 0)
+
+        // Embed the block through the target model's embedding hook.
+        let noise = targetModel.dflashEmbedTokens(blockIDs[.newAxis])
+        if DFlashDumper.isEnabled {
+            DFlashDumper.saveInt("swift_block_token_ids", blockIDs[.newAxis])
+            DFlashDumper.save("swift_noise_embedding", noise)
+        }
+
+        // Draft-model forward pass.
+        let hidden = draftModel(noiseEmbedding: noise, targetHidden: targetHidden, cache: draftCache)
+        if DFlashDumper.isEnabled {
+            DFlashDumper.save("swift_draft_hidden", hidden)
+        }
+
+        // Skip index 0 on the second-to-last axis, then apply the target LM head.
+        let tailHidden = hidden[.ellipsis, 1..., 0...]
+        let logits = targetModel.dflashLmHeadLogits(tailHidden)
+        if DFlashDumper.isEnabled {
+            DFlashDumper.save("swift_draft_logits", logits)
+        }
+
+        // Greedy pick, then drop the leading axis.
+        let picked = DFlashRuntime.greedyTokensWithMask(
+            logits: logits,
+            suppressTokenMask: suppressTokenMask
+        )
+        let drafted = picked.squeezed(axis: 0)
+
+        asyncEval(drafted)
+        return drafted
+    }
+}