Skip to content

Commit 3f10692

Browse files
howard0su and Copilot committed
feat(dflash): add --lazy-draft to C++ server
Park the decode draft model (~3.3 GB) when idle to free VRAM for pflash compression. Before generate, free the pflash drafter and unpark the decode draft; after generate, park the draft again.

Flow: startup → park draft | request → compress → free pflash drafter → unpark draft → generate → park draft

Saves ~3.3 GB VRAM on idle, enabling longer context on 22 GB GPUs. Lazy-draft is enabled by default; pass --no-lazy-draft to keep the decode draft loaded at all times.

Port of Python server.py --lazy-draft behavior to the C++ in-process server.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 538bf53 commit 3f10692

3 files changed

Lines changed: 24 additions & 0 deletions

File tree

dflash/src/server/http_server.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,13 +712,24 @@ void HttpServer::worker_loop() {
712712
};
713713

714714
// Run generation (with or without restore).
715+
// Lazy-draft: ensure decode draft is loaded before generate.
716+
if (config_.lazy_draft) {
717+
backend_.free_drafter(); // free pflash drafter (~1.4 GB) if loaded
718+
backend_.unpark("draft"); // reload decode draft (~3.3 GB)
719+
}
720+
715721
GenerateResult result;
716722
if (using_restore) {
717723
result = backend_.restore_and_generate(cache_slot, gen_req, io);
718724
} else {
719725
result = backend_.generate(gen_req, io);
720726
}
721727

728+
// Lazy-draft: park decode draft after generate to free VRAM.
729+
if (config_.lazy_draft) {
730+
backend_.park("draft");
731+
}
732+
722733
// Confirm or abort the inline snapshot.
723734
if (snap_prepared) {
724735
if (completion_tokens > 0 && !client_disconnected) {

dflash/src/server/http_server.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ struct ServerConfig {
5656
float pflash_keep_ratio = 0.05f; // fraction of tokens to keep
5757
std::string pflash_drafter_path; // path to drafter GGUF (Qwen3-0.6B)
5858
bool pflash_skip_park = false; // skip park/unpark for ≥32GB GPUs
59+
bool lazy_draft = true; // park decode draft when idle to save VRAM
5960

6061
// Disk prefix cache
6162
std::string disk_cache_dir; // empty = disabled

dflash/src/server/server_main.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ static void print_usage(const char * prog) {
6868
" --prefill-keep-ratio <F> Fraction of tokens to keep (default: 0.05)\n"
6969
" --prefill-drafter <path> Drafter GGUF for compression (Qwen3-0.6B)\n"
7070
" --prefill-skip-park Skip park/unpark (for >=32GB GPUs)\n"
71+
" --no-lazy-draft Keep decode draft loaded at all times\n"
7172
"\n"
7273
"Disk KV cache:\n"
7374
" --kv-cache-dir <path> Directory for ondisk KV cache (enables feature)\n"
@@ -140,6 +141,8 @@ int main(int argc, char ** argv) {
140141
sconfig.pflash_drafter_path = argv[++i];
141142
} else if (std::strcmp(argv[i], "--prefill-skip-park") == 0) {
142143
sconfig.pflash_skip_park = true;
144+
} else if (std::strcmp(argv[i], "--no-lazy-draft") == 0) {
145+
sconfig.lazy_draft = false;
143146
} else if (std::strcmp(argv[i], "--kv-cache-dir") == 0 && i + 1 < argc) {
144147
sconfig.disk_cache_dir = argv[++i];
145148
} else if (std::strcmp(argv[i], "--kv-cache-budget") == 0 && i + 1 < argc) {
@@ -269,6 +272,9 @@ int main(int argc, char ** argv) {
269272
std::fprintf(stderr, "[server] │ fp_use_bsa = %s\n", getenv("DFLASH_FP_USE_BSA") ? "ON" : "off");
270273
std::fprintf(stderr, "[server] │ fp_alpha = %s\n", getenv("DFLASH_FP_ALPHA") ? getenv("DFLASH_FP_ALPHA") : "0.12 (default)");
271274
}
275+
if (bargs.draft_path) {
276+
std::fprintf(stderr, "[server] │ lazy_draft = %s\n", sconfig.lazy_draft ? "ON" : "off");
277+
}
272278
std::fprintf(stderr, "[server] ╰─────────────────────────────────────────────────────╯\n\n");
273279

274280
HttpServer server(*backend, tokenizer, sconfig);
@@ -278,6 +284,12 @@ int main(int argc, char ** argv) {
278284
if (pflash_enabled) {
279285
server.set_drafter_tokenizer(&drafter_tokenizer);
280286
}
287+
288+
// Lazy-draft: park decode draft at startup to free VRAM (~3.3 GB).
289+
if (sconfig.lazy_draft && bargs.draft_path) {
290+
backend->park("draft");
291+
}
292+
281293
int ret = server.run();
282294

283295
// Cleanup.

0 commit comments

Comments
 (0)