diff --git a/dflash/src/common/model_backend.h b/dflash/src/common/model_backend.h index 504b68eb..6eb5b74a 100644 --- a/dflash/src/common/model_backend.h +++ b/dflash/src/common/model_backend.h @@ -174,6 +174,10 @@ struct ModelBackend { // supports_dflash_spec_decode() returns true. Default returns nullptr. virtual class DFlashTarget * dflash_target() { return nullptr; } + // Release oversized scratch buffers between requests to prevent VRAM + // growth over time. Default is a no-op; backends that hold such buffers override it. + virtual void release_scratch() {} + // ── Cleanup ────────────────────────────────────────────────────── // Release all resources (weights, cache, snapshots, drafter). // Called by run_daemon() before returning. diff --git a/dflash/src/qwen35/qwen35_backend.cpp b/dflash/src/qwen35/qwen35_backend.cpp index 8b08d69e..da49d9a0 100644 --- a/dflash/src/qwen35/qwen35_backend.cpp +++ b/dflash/src/qwen35/qwen35_backend.cpp @@ -13,6 +13,8 @@ #include "ggml-cuda.h" #include "common/snapshot_backend.h" +#include "pflash_ggml_adapter.h" +#include "flashprefill.h" #include #include @@ -436,6 +438,32 @@ void Qwen35Backend::shutdown() { } } +// ── Release scratch buffers between requests ──────────────────────────── + +void Qwen35Backend::release_scratch() { + // Target graph allocator: grows during large prefill batches, not needed + // between requests. Expected to be lazily recreated on the next build_target_step() — NOTE(review): confirm it handles a null sg_.alloc. + if (sg_.alloc) { + ggml_gallocr_free(sg_.alloc); + sg_.alloc = nullptr; + } + step_graph_free(sg_); + + // LM-head projection allocator (same pattern: free the allocator, null the handle, then free the step graph). + if (proj_sg_.alloc) { + ggml_gallocr_free(proj_sg_.alloc); + proj_sg_.alloc = nullptr; + } + step_graph_free(proj_sg_); + + // BSA persistent CUDA buffers (blockmask, head_mask_type, softmax_lse); freed only in builds that define DFLASH27B_HAVE_BSA. 
+#ifdef DFLASH27B_HAVE_BSA + flashprefill::dflash_bsa_free_persistent(); +#endif + + std::fprintf(stderr, "[vram] released scratch buffers\n"); +} + // ── Generate (speculative decode) ─────────────────────────────────────── GenerateResult Qwen35Backend::generate(const GenerateRequest & req, diff --git a/dflash/src/qwen35/qwen35_backend.h b/dflash/src/qwen35/qwen35_backend.h index d87f5f0b..f94f6ef4 100644 --- a/dflash/src/qwen35/qwen35_backend.h +++ b/dflash/src/qwen35/qwen35_backend.h @@ -109,6 +109,10 @@ class Qwen35Backend : public ModelBackend { void shutdown() override; + // Release oversized scratch buffers (gallocr, BSA cache) between requests + // to prevent VRAM growth over time. Overrides ModelBackend::release_scratch(). + void release_scratch() override; + private: // ── Configuration ──────────────────────────────────────────────── Qwen35Config cfg_; diff --git a/dflash/src/server/http_server.cpp b/dflash/src/server/http_server.cpp index 5d61da30..fb3d3b61 100644 --- a/dflash/src/server/http_server.cpp +++ b/dflash/src/server/http_server.cpp @@ -712,6 +712,12 @@ void HttpServer::worker_loop() { }; // Run generation (with or without restore). + // Lazy-draft: ensure decode draft is loaded before generate. + if (config_.lazy_draft) { + backend_.free_drafter(); // free pflash drafter (~1.4 GB) if loaded + backend_.unpark("draft"); // reload decode draft (~3.3 GB) + } + GenerateResult result; if (using_restore) { result = backend_.restore_and_generate(cache_slot, gen_req, io); @@ -719,6 +725,15 @@ void HttpServer::worker_loop() { result = backend_.generate(gen_req, io); } + // Lazy-draft: park decode draft after generate to free VRAM. + if (config_.lazy_draft) { + backend_.park("draft"); + } + + // Release oversized scratch buffers (gallocr, BSA cache) so VRAM + // doesn't grow monotonically across requests with different sizes. Not gated on lazy_draft. + backend_.release_scratch(); + // Confirm or abort the inline snapshot. 
if (snap_prepared) { if (completion_tokens > 0 && !client_disconnected) { diff --git a/dflash/src/server/http_server.h b/dflash/src/server/http_server.h index 8c0ec9eb..24d075d8 100644 --- a/dflash/src/server/http_server.h +++ b/dflash/src/server/http_server.h @@ -56,6 +56,7 @@ struct ServerConfig { float pflash_keep_ratio = 0.05f; // fraction of tokens to keep std::string pflash_drafter_path; // path to drafter GGUF (Qwen3-0.6B) bool pflash_skip_park = false; // skip park/unpark for ≥32GB GPUs + bool lazy_draft = false; // park decode draft when idle to save VRAM (off by default; set by --lazy-draft) // Disk prefix cache std::string disk_cache_dir; // empty = disabled diff --git a/dflash/src/server/server_main.cpp b/dflash/src/server/server_main.cpp index 319f97de..627e4c64 100644 --- a/dflash/src/server/server_main.cpp +++ b/dflash/src/server/server_main.cpp @@ -68,6 +68,7 @@ static void print_usage(const char * prog) { " --prefill-keep-ratio Fraction of tokens to keep (default: 0.05)\n" " --prefill-drafter Drafter GGUF for compression (Qwen3-0.6B)\n" " --prefill-skip-park Skip park/unpark (for >=32GB GPUs)\n" + " --lazy-draft Park decode draft when idle to save VRAM\n" "\n" "Disk KV cache:\n" " --kv-cache-dir Directory for ondisk KV cache (enables feature)\n" @@ -140,6 +141,8 @@ int main(int argc, char ** argv) { sconfig.pflash_drafter_path = argv[++i]; } else if (std::strcmp(argv[i], "--prefill-skip-park") == 0) { sconfig.pflash_skip_park = true; + } else if (std::strcmp(argv[i], "--lazy-draft") == 0) { + sconfig.lazy_draft = true; } else if (std::strcmp(argv[i], "--kv-cache-dir") == 0 && i + 1 < argc) { sconfig.disk_cache_dir = argv[++i]; } else if (std::strcmp(argv[i], "--kv-cache-budget") == 0 && i + 1 < argc) { @@ -194,6 +197,12 @@ int main(int argc, char ** argv) { setenv("DFLASH27B_FA_WINDOW", "0", 0); } + // Lazy-draft requires both prefill-drafter AND decode draft to be useful. Otherwise the flag is cleared below with a warning on stderr. 
+ if (sconfig.lazy_draft && !(pflash_enabled && bargs.draft_path)) { + std::fprintf(stderr, "[server] --lazy-draft ignored: requires both --prefill-drafter and --draft\n"); + sconfig.lazy_draft = false; + } + // Load tokenizer. std::fprintf(stderr, "[server] loading tokenizer from %s\n", bargs.model_path); Tokenizer tokenizer; @@ -269,6 +278,9 @@ int main(int argc, char ** argv) { std::fprintf(stderr, "[server] │ fp_use_bsa = %s\n", getenv("DFLASH_FP_USE_BSA") ? "ON" : "off"); std::fprintf(stderr, "[server] │ fp_alpha = %s\n", getenv("DFLASH_FP_ALPHA") ? getenv("DFLASH_FP_ALPHA") : "0.12 (default)"); } + if (bargs.draft_path) { + std::fprintf(stderr, "[server] │ lazy_draft = %s\n", sconfig.lazy_draft ? "ON" : "off"); + } std::fprintf(stderr, "[server] ╰─────────────────────────────────────────────────────╯\n\n"); HttpServer server(*backend, tokenizer, sconfig); @@ -278,6 +290,12 @@ int main(int argc, char ** argv) { if (pflash_enabled) { server.set_drafter_tokenizer(&drafter_tokenizer); } + + // Lazy-draft: park decode draft at startup to free VRAM (~3.3 GB). HttpServer::worker_loop() unparks it before each generate. + if (sconfig.lazy_draft && bargs.draft_path) { + backend->park("draft"); + } + int ret = server.run(); // Cleanup.