Skip to content

Commit 55c3e14

Browse files
author
Aegis-AI
committed
feat(turboquant): wire --turbo-kv flag into server and KVCache
Phase 2: Server.swift integration of TurboQuant KV-cache compression.

CLI:
  --turbo-kv  Enable 3-bit PolarQuant+QJL KV compression on all KVCacheSimple layers. Compresses history > 8192 tokens to ~3.5 bits/token — recommended for 100k+ context. Default: disabled (zero overhead when off).

KVCache.swift (submodule):
  KVCacheSimple.turboQuantEnabled: Bool = false
  Now settable at runtime so Server.swift can activate per-request.

Server.swift:
  - @Flag --turbo-kv added to CLI
  - turboKV stored in ServerConfig
  - Startup log shows turbo_kv=enabled/disabled
  - Sets .turboQuantEnabled = true on each KVCacheSimple before prefill
1 parent a623a8c commit 55c3e14

2 files changed

Lines changed: 21 additions & 3 deletions

File tree

Sources/mlx-server/Server.swift

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,9 @@ struct MLXServer: AsyncParsableCommand {
192192
@Flag(name: .long, help: "Enable SSD expert streaming for MoE models (Flash-MoE style memory-mapping)")
193193
var streamExperts: Bool = false
194194

195+
@Flag(name: .long, help: "Enable TurboQuant KV-cache compression (3-bit PolarQuant+QJL). Compresses KV history > 8192 tokens to ~3.5 bits/token — recommended for 100k+ context. Default: disabled")
196+
var turboKV: Bool = false
197+
195198
@Option(name: .long, help: "Chunk size for prefill evaluation (default: 512, lower to prevent GPU timeout on large models)")
196199
var prefillSize: Int = 512
197200

@@ -397,7 +400,8 @@ struct MLXServer: AsyncParsableCommand {
397400
repeatPenalty: self.repeatPenalty,
398401
thinking: self.thinking,
399402
isVision: isVision,
400-
prefillSize: self.prefillSize
403+
prefillSize: self.prefillSize,
404+
turboKV: self.turboKV
401405
)
402406

403407
let parallelSlots = self.parallel
@@ -425,7 +429,8 @@ struct MLXServer: AsyncParsableCommand {
425429
let authStr = apiKeyValue != nil ? "enabled" : "disabled"
426430
let thinkingStr = config.thinking ? "enabled" : "disabled"
427431
let ssdStr = self.streamExperts ? "enabled" : "disabled"
428-
print("[mlx-server] Config: ctx_size=\(ctxSizeStr), temp=\(config.temp), top_p=\(config.topP), repeat_penalty=\(penaltyStr), parallel=\(parallelSlots), cors=\(corsStr), mem_limit=\(memLimitStr), auth=\(authStr), thinking=\(thinkingStr), ssd_stream=\(ssdStr)")
432+
let turboKVStr = config.turboKV ? "enabled" : "disabled"
433+
print("[mlx-server] Config: ctx_size=\(ctxSizeStr), temp=\(config.temp), top_p=\(config.topP), repeat_penalty=\(penaltyStr), parallel=\(parallelSlots), cors=\(corsStr), mem_limit=\(memLimitStr), auth=\(authStr), thinking=\(thinkingStr), ssd_stream=\(ssdStr), turbo_kv=\(turboKVStr)")
429434

430435
// ── Build Hummingbird router ──
431436
let router = Router()
@@ -647,6 +652,8 @@ struct ServerConfig: Sendable {
647652
let thinking: Bool
648653
let isVision: Bool
649654
let prefillSize: Int
655+
/// When true, each KVCacheSimple layer compresses history beyond 8192 tokens with 3-bit PolarQuant+QJL (~3.5 bits/token).
656+
let turboKV: Bool
650657
}
651658

652659
// ── Model Directory Resolution ───────────────────────────────────────────────
@@ -896,6 +903,17 @@ func handleChatCompletion(
896903
let stream: AsyncStream<Generation> = try await container.perform { context in
897904
let cache = context.model.newCache(parameters: params)
898905

906+
// ── TurboQuant: enable 3-bit KV compression on every KVCacheSimple layer ──
907+
// This compresses cache history older than 8192 tokens into ~3.5 bits/token PolarQuant+QJL
908+
// form, halving KV RAM for long-context (100k+) requests.
909+
if config.turboKV {
910+
for layer in cache {
911+
if let simple = layer as? KVCacheSimple {
912+
simple.turboQuantEnabled = true
913+
}
914+
}
915+
}
916+
899917
// Try to restore cached system prompt KV state
900918
if let cachedCount = await promptCache.restore(tokenHash: systemHash, into: cache) {
901919
// Cache hit: skip the cached prefix tokens, process only the rest

mlx-swift-lm

0 commit comments

Comments
 (0)