From 3f1069241e9406943ae00fba604c8745760683a4 Mon Sep 17 00:00:00 2001 From: Howard Su Date: Thu, 21 May 2026 18:00:46 +0800 Subject: [PATCH 1/3] feat(dflash): add --lazy-draft to C++ server MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Park the decode draft model (~3.3 GB) when idle to free VRAM for pflash compression. Before generate, free the pflash drafter and unpark the decode draft; after generate, park the draft again. Flow: startup → park draft | request → compress → free pflash drafter → unpark draft → generate → park draft Saves ~3.3 GB of VRAM while idle, enabling longer context on 22 GB GPUs. Lazy-draft is on by default in this patch; pass --no-lazy-draft to keep the decode draft loaded at all times. Port of Python server.py --lazy-draft behavior to the C++ in-process server. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- dflash/src/server/http_server.cpp | 11 +++++++++++ dflash/src/server/http_server.h | 1 + dflash/src/server/server_main.cpp | 12 ++++++++++++ 3 files changed, 24 insertions(+) diff --git a/dflash/src/server/http_server.cpp b/dflash/src/server/http_server.cpp index 5d61da30..c0e16ace 100644 --- a/dflash/src/server/http_server.cpp +++ b/dflash/src/server/http_server.cpp @@ -712,6 +712,12 @@ void HttpServer::worker_loop() { }; // Run generation (with or without restore). + // Lazy-draft: ensure decode draft is loaded before generate. + if (config_.lazy_draft) { + backend_.free_drafter(); // free pflash drafter (~1.4 GB) if loaded + backend_.unpark("draft"); // reload decode draft (~3.3 GB) + } + GenerateResult result; if (using_restore) { result = backend_.restore_and_generate(cache_slot, gen_req, io); @@ -719,6 +725,11 @@ void HttpServer::worker_loop() { result = backend_.generate(gen_req, io); } + // Lazy-draft: park decode draft after generate to free VRAM. + if (config_.lazy_draft) { + backend_.park("draft"); + } + // Confirm or abort the inline snapshot. 
if (snap_prepared) { if (completion_tokens > 0 && !client_disconnected) { diff --git a/dflash/src/server/http_server.h b/dflash/src/server/http_server.h index 8c0ec9eb..73d1e552 100644 --- a/dflash/src/server/http_server.h +++ b/dflash/src/server/http_server.h @@ -56,6 +56,7 @@ struct ServerConfig { float pflash_keep_ratio = 0.05f; // fraction of tokens to keep std::string pflash_drafter_path; // path to drafter GGUF (Qwen3-0.6B) bool pflash_skip_park = false; // skip park/unpark for ≥32GB GPUs + bool lazy_draft = true; // park decode draft when idle to save VRAM // Disk prefix cache std::string disk_cache_dir; // empty = disabled diff --git a/dflash/src/server/server_main.cpp b/dflash/src/server/server_main.cpp index 319f97de..a666e849 100644 --- a/dflash/src/server/server_main.cpp +++ b/dflash/src/server/server_main.cpp @@ -68,6 +68,7 @@ static void print_usage(const char * prog) { " --prefill-keep-ratio Fraction of tokens to keep (default: 0.05)\n" " --prefill-drafter Drafter GGUF for compression (Qwen3-0.6B)\n" " --prefill-skip-park Skip park/unpark (for >=32GB GPUs)\n" + " --no-lazy-draft Keep decode draft loaded at all times\n" "\n" "Disk KV cache:\n" " --kv-cache-dir Directory for ondisk KV cache (enables feature)\n" @@ -140,6 +141,8 @@ int main(int argc, char ** argv) { sconfig.pflash_drafter_path = argv[++i]; } else if (std::strcmp(argv[i], "--prefill-skip-park") == 0) { sconfig.pflash_skip_park = true; + } else if (std::strcmp(argv[i], "--no-lazy-draft") == 0) { + sconfig.lazy_draft = false; } else if (std::strcmp(argv[i], "--kv-cache-dir") == 0 && i + 1 < argc) { sconfig.disk_cache_dir = argv[++i]; } else if (std::strcmp(argv[i], "--kv-cache-budget") == 0 && i + 1 < argc) { @@ -269,6 +272,9 @@ int main(int argc, char ** argv) { std::fprintf(stderr, "[server] │ fp_use_bsa = %s\n", getenv("DFLASH_FP_USE_BSA") ? "ON" : "off"); std::fprintf(stderr, "[server] │ fp_alpha = %s\n", getenv("DFLASH_FP_ALPHA") ? 
getenv("DFLASH_FP_ALPHA") : "0.12 (default)"); } + if (bargs.draft_path) { + std::fprintf(stderr, "[server] │ lazy_draft = %s\n", sconfig.lazy_draft ? "ON" : "off"); + } std::fprintf(stderr, "[server] ╰─────────────────────────────────────────────────────╯\n\n"); HttpServer server(*backend, tokenizer, sconfig); @@ -278,6 +284,12 @@ int main(int argc, char ** argv) { if (pflash_enabled) { server.set_drafter_tokenizer(&drafter_tokenizer); } + + // Lazy-draft: park decode draft at startup to free VRAM (~3.3 GB). + if (sconfig.lazy_draft && bargs.draft_path) { + backend->park("draft"); + } + int ret = server.run(); // Cleanup. From 33e35a7564c5000e0189c61d4e043bb4547cd33e Mon Sep 17 00:00:00 2001 From: Howard Su Date: Thu, 21 May 2026 20:09:13 +0800 Subject: [PATCH 2/3] fix: release scratch VRAM buffers between requests The target gallocr, LM-head projection gallocr, and BSA persistent CUDA buffers grow monotonically with request size but never shrink. After a large-prompt request (e.g. agent 2k tokens), subsequent smaller requests suffer VRAM pressure causing KV cache spill to system RAM and ~2x decode slowdown. Add ModelBackend::release_scratch() called after each HTTP request completes. Qwen35Backend implementation frees: - sg_.alloc (target graph allocator) - proj_sg_.alloc (LM-head projection allocator) - BSA persistent device buffers (blockmask, head_mask_type, softmax_lse) All are lazily recreated at the exact size needed on the next request. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- dflash/src/common/model_backend.h | 4 ++++ dflash/src/qwen35/qwen35_backend.cpp | 28 ++++++++++++++++++++++++++++ dflash/src/qwen35/qwen35_backend.h | 4 ++++ dflash/src/server/http_server.cpp | 4 ++++ 4 files changed, 40 insertions(+) diff --git a/dflash/src/common/model_backend.h b/dflash/src/common/model_backend.h index 504b68eb..6eb5b74a 100644 --- a/dflash/src/common/model_backend.h +++ b/dflash/src/common/model_backend.h @@ -174,6 +174,10 @@ struct ModelBackend { // supports_dflash_spec_decode() returns true. Default returns nullptr. virtual class DFlashTarget * dflash_target() { return nullptr; } + // Release oversized scratch buffers between requests to prevent VRAM + // growth over time. Default is a no-op. + virtual void release_scratch() {} + // ── Cleanup ────────────────────────────────────────────────────── // Release all resources (weights, cache, snapshots, drafter). // Called by run_daemon() before returning. diff --git a/dflash/src/qwen35/qwen35_backend.cpp b/dflash/src/qwen35/qwen35_backend.cpp index 8b08d69e..da49d9a0 100644 --- a/dflash/src/qwen35/qwen35_backend.cpp +++ b/dflash/src/qwen35/qwen35_backend.cpp @@ -13,6 +13,8 @@ #include "ggml-cuda.h" #include "common/snapshot_backend.h" +#include "pflash_ggml_adapter.h" +#include "flashprefill.h" #include #include @@ -436,6 +438,32 @@ void Qwen35Backend::shutdown() { } } +// ── Release scratch buffers between requests ──────────────────────────── + +void Qwen35Backend::release_scratch() { + // Target graph allocator: grows during large prefill batches, not needed + // between requests. Will be lazily recreated on next build_target_step(). + if (sg_.alloc) { + ggml_gallocr_free(sg_.alloc); + sg_.alloc = nullptr; + } + step_graph_free(sg_); + + // LM-head projection allocator (same pattern). 
+ if (proj_sg_.alloc) { + ggml_gallocr_free(proj_sg_.alloc); + proj_sg_.alloc = nullptr; + } + step_graph_free(proj_sg_); + + // BSA persistent CUDA buffers (blockmask, head_mask_type, softmax_lse). +#ifdef DFLASH27B_HAVE_BSA + flashprefill::dflash_bsa_free_persistent(); +#endif + + std::fprintf(stderr, "[vram] released scratch buffers\n"); +} + // ── Generate (speculative decode) ─────────────────────────────────────── GenerateResult Qwen35Backend::generate(const GenerateRequest & req, diff --git a/dflash/src/qwen35/qwen35_backend.h b/dflash/src/qwen35/qwen35_backend.h index d87f5f0b..f94f6ef4 100644 --- a/dflash/src/qwen35/qwen35_backend.h +++ b/dflash/src/qwen35/qwen35_backend.h @@ -109,6 +109,10 @@ class Qwen35Backend : public ModelBackend { void shutdown() override; + // Release oversized scratch buffers (gallocr, BSA cache) between requests + // to prevent VRAM growth over time. + void release_scratch() override; + private: // ── Configuration ──────────────────────────────────────────────── Qwen35Config cfg_; diff --git a/dflash/src/server/http_server.cpp b/dflash/src/server/http_server.cpp index c0e16ace..fb3d3b61 100644 --- a/dflash/src/server/http_server.cpp +++ b/dflash/src/server/http_server.cpp @@ -730,6 +730,10 @@ void HttpServer::worker_loop() { backend_.park("draft"); } + // Release oversized scratch buffers (gallocr, BSA cache) so VRAM + // doesn't grow monotonically across requests with different sizes. + backend_.release_scratch(); + // Confirm or abort the inline snapshot. 
if (snap_prepared) { if (completion_tokens > 0 && !client_disconnected) { From 88d5b6243ef5b5696a94f2d2d9ec43c5dd0e1c4d Mon Sep 17 00:00:00 2001 From: Howard Su Date: Fri, 22 May 2026 11:39:51 +0800 Subject: [PATCH 3/3] Make lazy-draft default to off Flip ServerConfig::lazy_draft to false and replace the opt-out --no-lazy-draft flag with an opt-in --lazy-draft flag. The flag is ignored, with a warning on stderr, unless both --prefill-drafter and --draft are in effect. --- dflash/src/server/http_server.h | 2 +- dflash/src/server/server_main.cpp | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/dflash/src/server/http_server.h b/dflash/src/server/http_server.h index 73d1e552..24d075d8 100644 --- a/dflash/src/server/http_server.h +++ b/dflash/src/server/http_server.h @@ -56,7 +56,7 @@ struct ServerConfig { float pflash_keep_ratio = 0.05f; // fraction of tokens to keep std::string pflash_drafter_path; // path to drafter GGUF (Qwen3-0.6B) bool pflash_skip_park = false; // skip park/unpark for ≥32GB GPUs - bool lazy_draft = true; // park decode draft when idle to save VRAM + bool lazy_draft = false; // park decode draft when idle to save VRAM // Disk prefix cache std::string disk_cache_dir; // empty = disabled diff --git a/dflash/src/server/server_main.cpp b/dflash/src/server/server_main.cpp index a666e849..627e4c64 100644 --- a/dflash/src/server/server_main.cpp +++ b/dflash/src/server/server_main.cpp @@ -68,7 +68,7 @@ static void print_usage(const char * prog) { " --prefill-keep-ratio Fraction of tokens to keep (default: 0.05)\n" " --prefill-drafter Drafter GGUF for compression (Qwen3-0.6B)\n" " --prefill-skip-park Skip park/unpark (for >=32GB GPUs)\n" - " --no-lazy-draft Keep decode draft loaded at all times\n" + " --lazy-draft Park decode draft when idle to save VRAM\n" "\n" "Disk KV cache:\n" " --kv-cache-dir Directory for ondisk KV cache (enables feature)\n" @@ -141,8 +141,8 @@ int main(int argc, char ** argv) { sconfig.pflash_drafter_path = argv[++i]; } else if (std::strcmp(argv[i], "--prefill-skip-park") == 0) { sconfig.pflash_skip_park = true; - } else if (std::strcmp(argv[i], "--no-lazy-draft") == 0) { - sconfig.lazy_draft = false; + } else if 
(std::strcmp(argv[i], "--lazy-draft") == 0) { + sconfig.lazy_draft = true; } else if (std::strcmp(argv[i], "--kv-cache-dir") == 0 && i + 1 < argc) { sconfig.disk_cache_dir = argv[++i]; } else if (std::strcmp(argv[i], "--kv-cache-budget") == 0 && i + 1 < argc) { @@ -197,6 +197,12 @@ int main(int argc, char ** argv) { setenv("DFLASH27B_FA_WINDOW", "0", 0); } + // Lazy-draft requires both prefill-drafter AND decode draft to be useful. + if (sconfig.lazy_draft && !(pflash_enabled && bargs.draft_path)) { + std::fprintf(stderr, "[server] --lazy-draft ignored: requires both --prefill-drafter and --draft\n"); + sconfig.lazy_draft = false; + } + // Load tokenizer. std::fprintf(stderr, "[server] loading tokenizer from %s\n", bargs.model_path); Tokenizer tokenizer;