Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
1040e68
feat: add initial dflash implementation
0xClandestine Apr 21, 2026
e1ea48f
fix(dflash): load hiddenNorm weight + streaming + prefetch + asyncEval
0xClandestine Apr 21, 2026
7820436
feat: selective safetensors loader — skip expert weight data with SSD…
0xClandestine Apr 21, 2026
9b91b4d
feat: add timings (tok/s, token count, duration) to all API responses
0xClandestine Apr 21, 2026
d6fdef4
feat: add bench_35b.sh benchmark script
0xClandestine Apr 21, 2026
485a929
feat: add Qwen3Next SSD streaming + DFlash support
0xClandestine Apr 21, 2026
f2ab918
refactor(dflash/kernels): branchless mask via metal::select + 2D kern…
0xClandestine Apr 23, 2026
464b959
feat(dflash): add MambaSnapshotCache + dflashUseTapeRollback protocol…
0xClandestine Apr 23, 2026
a2c8102
feat: add DFlashKernelBench micro-benchmark target
0xClandestine Apr 23, 2026
0d96a5e
feat(bench): add JSON result export to bench_35b.sh; add bench_coder_…
0xClandestine Apr 23, 2026
108f0c2
test: reorganize DFlash test suite into tests/DFlash/
0xClandestine Apr 23, 2026
7d150f9
refactor(Qwen3Next): move DFlashTargetModel conformance to SwiftLM ex…
0xClandestine Apr 23, 2026
c680a47
Merge remote-tracking branch 'upstream/main' into feat/add-dflash
0xClandestine Apr 23, 2026
a52bd07
fix: resolve DFlash protocol conformance and build blockers
github-actions[bot] Apr 23, 2026
2ea4e96
fix: address Copilot review on PR #78
0xClandestine Apr 23, 2026
602f940
fix(bench): increase server wait timeout to 3600s to allow large mode…
github-actions[bot] Apr 23, 2026
6f0c670
docs: add DFlash parameters to README CLI options list
github-actions[bot] Apr 23, 2026
7dcdaf4
chore: bump mlx-swift-lm submodule to b447
github-actions[bot] Apr 23, 2026
60d88e4
fix: restore DFlashRollbackCache protocol and clean dead extension
github-actions[bot] Apr 23, 2026
0360ea9
Merge remote-tracking branch 'origin/main' into pr-78
github-actions[bot] Apr 23, 2026
f629f63
test(dflash): fix submodule pin and add E2E tests
github-actions[bot] Apr 23, 2026
7e7ccd1
fix(benchmark): exit early on DFlash tests to avoid model prompt
github-actions[bot] Apr 23, 2026
fd84f80
chore: move dflash benchmark scripts to profiling dir
github-actions[bot] Apr 23, 2026
5553bf5
fix: disable prompt cache for MambaCache hybrid models (Qwen3Next)
github-actions[bot] Apr 23, 2026
2d537d6
fix: use SUITE_OPT env var to bypass menu in matrix sub-processes
github-actions[bot] Apr 23, 2026
0dba57a
fix: suppress interactive menu in sub-process invocations
github-actions[bot] Apr 23, 2026
b7dcd53
fix: remove stray banner echo outside SUITE_OPT guard
github-actions[bot] Apr 23, 2026
5581f38
fix: add 'Using speculative decoding' log line for CI test assertions
github-actions[bot] Apr 23, 2026
4c042a6
fix: add required log lines to DFlash draft model load path
github-actions[bot] Apr 24, 2026
069a75f
feat: add DFlashTargetModel conformance for Qwen3, Qwen3MoE, and Llama
0xClandestine Apr 24, 2026
9fc993c
fix(ci): skip omni test gracefully when RAM is insufficient
github-actions[bot] Apr 24, 2026
b224692
Revert "fix(ci): skip omni test gracefully when RAM is insufficient"
github-actions[bot] Apr 24, 2026
313fa91
feat: add DeepSeek V3 and Kimi Linear DFlash support (Option B)
0xClandestine Apr 24, 2026
0e79358
fix: resolve CI GPU timeouts on 7GB runners by fixing Memory limit sp…
github-actions[bot] Apr 24, 2026
13505e6
Merge 313fa91 from clandestine
github-actions[bot] Apr 24, 2026
d6bcf66
fix: correct weight key paths for DeepseekV3 and KimiLinear models
0xClandestine Apr 24, 2026
b5037f6
fix: strip language_model. prefix, remove stale expert keys, raise FD…
0xClandestine Apr 24, 2026
91e32af
fix: cap Metal command buffer size during swap-assisted inference to …
github-actions[bot] Apr 24, 2026
2707be9
fix: prevent Metal GPU Watchdog timeout on low-RAM CI runners
github-actions[bot] Apr 24, 2026
65d74a9
Merge origin/main to resolve conflicts
github-actions[bot] Apr 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,99 @@ jobs:
path: /tmp/SwiftLM-test-speculative.log
retention-days: 7

# ── DFlash Speculative Decoding E2E ──
# Uses the standard macos-15 runner (7 GB RAM).
# Pipeline: release build -> build and install mlx.metallib -> pre-download
# the two HuggingFace models -> run tests/test-dflash.sh (up to 3 attempts)
# -> upload the test log if the job failed.
dflash-speculative-decoding:
runs-on: macos-15
timeout-minutes: 45
# Starts only after the build_and_unit_test job.
needs: build_and_unit_test
steps:
- uses: actions/checkout@v4
with:
submodules: recursive

# `|| true`: a failed component download does not fail the job.
- name: Install Metal Toolchain
run: xcodebuild -downloadComponent MetalToolchain || true

# Cache key includes the hash of Package.resolved; restore-keys falls back
# to any cache sharing the `<os>-spm-SwiftLM-v3-` prefix.
- name: Cache Swift packages
uses: actions/cache@v4
with:
path: .build
key: ${{ runner.os }}-spm-SwiftLM-v3-${{ hashFiles('Package.resolved') }}
restore-keys: |
${{ runner.os }}-spm-SwiftLM-v3-

# Removes every ModuleCache directory under .build; errors are discarded
# and the step always succeeds.
- name: Clear stale module cache
run: find .build -type d -name ModuleCache -exec rm -rf {} + 2>/dev/null || true

- name: Resolve dependencies
run: swift package resolve

- name: Build (Release)
run: swift build -c release

# Builds the `mlx-metallib` CMake target from the MLX sources (a top-level
# mlx-swift directory if present, otherwise the SwiftPM checkout under
# .build/checkouts), copies the first mlx.metallib found into .build/release,
# then creates a Python venv at /tmp/mlx_venv and pip-installs
# `huggingface_hub` and `hf` into it.
# NOTE(review): cmake/make output is piped through `tail`, and the `cp` is not
# guarded against an empty find result — confirm that a failed metallib build
# surfaces as a step failure.
- name: Compile and install custom MLX Metal library
run: |
if [ -d "mlx-swift/Source/Cmlx/mlx" ]; then
MLX_SRC="mlx-swift/Source/Cmlx/mlx"
else
MLX_SRC=".build/checkouts/mlx-swift/Source/Cmlx/mlx"
fi
mkdir -p .build/metallib_build
pushd .build/metallib_build
cmake "../../$MLX_SRC" \
-DMLX_BUILD_TESTS=OFF \
-DMLX_BUILD_EXAMPLES=OFF \
-DMLX_BUILD_BENCHMARKS=OFF \
-DMLX_BUILD_PYTHON_BINDINGS=OFF \
-DMLX_METAL_JIT=OFF \
-DMLX_ENABLE_NAX=1 \
-DCMAKE_BUILD_TYPE=Release 2>&1 | tail -20
make mlx-metallib -j$(sysctl -n hw.ncpu) 2>&1 | tail -20
popd
BUILT=$(find .build/metallib_build -name "mlx.metallib" | head -1)
cp "$BUILT" .build/release/mlx.metallib
python3 -m venv /tmp/mlx_venv
/tmp/mlx_venv/bin/pip install --quiet huggingface_hub hf

# Fixed cache key (no hash component) for the HuggingFace cache directory.
- name: Cache MLX models (dflash + main)
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: mlx-dflash-qwen35-4b

# Best-effort downloads: `|| true` means a failed download does not fail
# this step.
# NOTE(review): presumably the E2E script fetches any missing model itself —
# confirm against tests/test-dflash.sh.
- name: Pre-download HuggingFace models
run: |
source /tmp/mlx_venv/bin/activate
hf download mlx-community/Qwen3.5-4B-4bit || true
hf download z-lab/Qwen3.5-4B-DFlash || true

# Runs the E2E script up to three times, sleeping 10s between attempts.
# Exits 0 on the first passing attempt and 1 once all three have failed.
# The script receives the release binary path and 15415
# (presumably the server port — confirm against tests/test-dflash.sh).
- name: Run DFlash E2E
env:
HF_HUB_DOWNLOAD_TIMEOUT: "900"
run: |
chmod +x tests/test-dflash.sh
for attempt in 1 2 3; do
echo "Attempt $attempt of 3..."
if tests/test-dflash.sh .build/release/SwiftLM 15415; then
exit 0
fi
if [ "$attempt" -lt 3 ]; then
echo "Test failed, retrying in 10s..."
sleep 10
fi
done
echo "All attempts failed"
exit 1

# Runs only when an earlier step failed; the log artifact is kept for 7 days.
- name: Upload dflash test logs on failure
if: failure()
uses: actions/upload-artifact@v4
with:
name: dflash-test-logs
path: /tmp/SwiftLM-test-dflash.log
retention-days: 7

# ── Speculative Decoding Memory Evaluation ──
# Runs the 2B model with NUM_DRAFT_TOKENS=2 to check peak
# memory compression/efficiency. Emits vm_stat readings as step summary.
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,6 @@ tmp/
.agents/harness/audio-omni-gemma4/runs/
.venv/
mem-palace/


tests/DFlash/intermediates/
44 changes: 22 additions & 22 deletions Package.resolved

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 25 additions & 1 deletion Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@ let package = Package(
platforms: [.macOS(.v14), .iOS(.v17)],
products: [
.library(name: "MLXInferenceCore", targets: ["MLXInferenceCore"]),
.library(name: "DFlash", targets: ["DFlash"]),
.executable(name: "SwiftLM", targets: ["SwiftLM"]),
.executable(name: "SwiftBuddy", targets: ["SwiftBuddy"])
.executable(name: "SwiftBuddy", targets: ["SwiftBuddy"]),
.executable(name: "DFlashKernelBench", targets: ["DFlashKernelBench"])
],
dependencies: [
// Local Apple MLX Swift fork for C++ extensions
Expand All @@ -29,6 +31,7 @@ let package = Package(
name: "SwiftLM",
dependencies: [
"MLXInferenceCore",
"DFlash",
.product(name: "MLX", package: "mlx-swift"),
.product(name: "MLXLLM", package: "mlx-swift-lm"),
.product(name: "MLXVLM", package: "mlx-swift-lm"),
Expand All @@ -40,6 +43,16 @@ let package = Package(
],
path: "Sources/SwiftLM"
),
// ── DFlash Kernel Micro-Benchmark ───────────────────────────
.executableTarget(
name: "DFlashKernelBench",
dependencies: [
"DFlash",
.product(name: "MLX", package: "mlx-swift"),
.product(name: "MLXNN", package: "mlx-swift"),
],
path: "Sources/DFlashKernelBench"
),
// ── STFT Audio Profiling Testing Script (macOS only) ───────────
.executableTarget(
name: "SwiftLMTestSTFT",
Expand Down Expand Up @@ -86,6 +99,17 @@ let package = Package(
.enableExperimentalFeature("StrictConcurrency")
]
),
// ── DFlash Speculative Decoding ─────────────────────────────
.target(
name: "DFlash",
dependencies: [
.product(name: "MLX", package: "mlx-swift"),
.product(name: "MLXLLM", package: "mlx-swift-lm"),
.product(name: "MLXLMCommon", package: "mlx-swift-lm"),
],
path: "Sources/DFlash",
exclude: ["DFlashKernelsOptimized.swift"]
),
// ── Automated Test Harness ──────────────────────────────────
.testTarget(
name: "SwiftBuddyTests",
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,8 @@ curl http://localhost:5413/v1/chat/completions \
| `--turbo-kv` | `false` | Enable TurboQuant 3-bit KV cache compression (activates after 2048 tokens, server-wide) |
| `--draft-model` | (none) | Draft model path/ID for speculative decoding. When used with `--stream-experts`, `--num-draft-tokens` is auto-capped to 1 to minimise SSD I/O fan-out (see performance note above). |
| `--num-draft-tokens` | `4` | Tokens per speculation round. Auto-capped to 1 when combined with `--stream-experts`. |
| `--dflash` | `false` | Enable DFlash block-diffusion speculative decoding. Requires a compatible DFlash draft model. |
| `--dflash-block-size` | (auto) | Number of tokens per DFlash draft block. Defaults to the value in the draft model's config. |

## 🔧 Per-Request API Parameters

Expand Down
91 changes: 91 additions & 0 deletions Sources/DFlash/DFlashDraftBackend.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Copyright 2026 SwiftLM Contributors
// MIT License — see LICENSE file
// Based on DFlash (arXiv:2602.06036)

import Foundation
import MLX
import MLXLMCommon
import MLXNN

// MARK: - Draft Backend

/// Drives a DFlash draft model to propose speculative tokens for a target model.
///
/// The class declares no stored properties; everything it works on is passed
/// into its methods by the caller.
public final class DFlashDraftBackend: @unchecked Sendable {

    /// Creates a backend.
    public init() {}

    /// Builds the draft-side KV caches, one `ContextOnlyDraftKVCache` per
    /// entry in `draftModel.layers`.
    ///
    /// - Parameters:
    ///   - draftModel: The DFlash draft model whose layer count sizes the result.
    ///   - sinkSize: Forwarded unchanged to each cache's initializer.
    ///   - windowSize: Forwarded unchanged to each cache's initializer.
    /// - Returns: A freshly constructed cache for every draft layer, in layer order.
    public func makeCache(
        draftModel: DFlashDraftModel,
        sinkSize: Int = 64,
        windowSize: Int = 1024
    ) -> [ContextOnlyDraftKVCache] {
        let layerCount = draftModel.layers.count
        var caches: [ContextOnlyDraftKVCache] = []
        caches.reserveCapacity(layerCount)
        for _ in 0 ..< layerCount {
            caches.append(
                ContextOnlyDraftKVCache(sinkSize: sinkSize, windowSize: windowSize)
            )
        }
        return caches
    }

    /// Drafts the tail of a block greedily with the DFlash draft model.
    ///
    /// The block input is the first element of `stagedFirst` followed by the
    /// first `blockLen - 1` elements of `maskTokenTail`. That block is embedded
    /// through the target model, passed through the draft model together with
    /// `targetHidden`, projected to logits with the target model's LM head
    /// (skipping position 0 of the block), and decoded by
    /// `DFlashRuntime.greedyTokensWithMask`.
    ///
    /// When `DFlashDumper.isEnabled` is true, the block token IDs, noise
    /// embedding, draft hidden states and draft logits are handed to
    /// `DFlashDumper` along the way.
    ///
    /// - Parameters:
    ///   - targetModel: Target model supplying the embedding and LM-head hooks.
    ///   - draftModel: The DFlash draft model.
    ///   - draftCache: KV caches for the draft model.
    ///   - stagedFirst: Holds the block's first token; only its first element is used.
    ///   - targetHidden: Target-model hidden states passed to the draft model as context.
    ///   - blockLen: Total block length including the staged first token. Must be > 1.
    ///   - maskTokenTail: Mask token IDs that fill block positions `1..<blockLen`.
    ///   - suppressTokenMask: Optional mask forwarded to the greedy decoder.
    /// - Returns: The drafted token IDs with the leading axis squeezed away
    ///   (presumably `blockLen - 1` tokens — confirm against the draft model's
    ///   output shape). Evaluation is scheduled with `asyncEval` before returning.
    public func draftGreedy(
        targetModel: any DFlashTargetModel,
        draftModel: DFlashDraftModel,
        draftCache: [ContextOnlyDraftKVCache],
        stagedFirst: MLXArray,
        targetHidden: MLXArray,
        blockLen: Int,
        maskTokenTail: MLXArray,
        suppressTokenMask: MLXArray? = nil
    ) -> MLXArray {
        precondition(blockLen > 1, "draftGreedy requires blockLen > 1")

        // Assemble the block: the staged token followed by the mask placeholders.
        let tailCount = blockLen - 1
        let headToken = stagedFirst[..<1]
        let maskTail = maskTokenTail[..<tailCount]
        let blockTokenIDs = concatenated([headToken, maskTail], axis: 0)

        // Embed the block with the target model's embedding hook; a leading
        // axis is added first.
        let batchedTokenIDs = blockTokenIDs[.newAxis]
        let noiseEmbedding = targetModel.dflashEmbedTokens(batchedTokenIDs)
        if DFlashDumper.isEnabled {
            DFlashDumper.saveInt("swift_block_token_ids", batchedTokenIDs)
            DFlashDumper.save("swift_noise_embedding", noiseEmbedding)
        }

        // Forward pass through the draft model.
        let draftHidden = draftModel(
            noiseEmbedding: noiseEmbedding,
            targetHidden: targetHidden,
            cache: draftCache
        )
        if DFlashDumper.isEnabled {
            DFlashDumper.save("swift_draft_hidden", draftHidden)
        }

        // Drop block position 0 and project the rest through the target's LM head.
        let tailHidden = draftHidden[.ellipsis, 1..., 0...]
        let draftLogits = targetModel.dflashLmHeadLogits(tailHidden)
        if DFlashDumper.isEnabled {
            DFlashDumper.save("swift_draft_logits", draftLogits)
        }

        // Greedy pick per position, then remove the leading axis.
        let pickedTokens = DFlashRuntime.greedyTokensWithMask(
            logits: draftLogits,
            suppressTokenMask: suppressTokenMask
        )
        let drafted = pickedTokens.squeezed(axis: 0)

        // Schedule evaluation before handing the result back.
        asyncEval(drafted)
        return drafted
    }
}
Loading