Skip to content

Commit 88d5b62

Browse files
committed
Make lazy-draft default to off
1 parent 33e35a7 commit 88d5b62

2 files changed

Lines changed: 10 additions & 4 deletions

File tree

dflash/src/server/http_server.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ struct ServerConfig {
5656
float pflash_keep_ratio = 0.05f; // fraction of tokens to keep
5757
std::string pflash_drafter_path; // path to drafter GGUF (Qwen3-0.6B)
5858
bool pflash_skip_park = false; // skip park/unpark for ≥32GB GPUs
59-
bool lazy_draft = true; // park decode draft when idle to save VRAM
59+
bool lazy_draft = false; // park decode draft when idle to save VRAM
6060

6161
// Disk prefix cache
6262
std::string disk_cache_dir; // empty = disabled

dflash/src/server/server_main.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ static void print_usage(const char * prog) {
6868
" --prefill-keep-ratio <F> Fraction of tokens to keep (default: 0.05)\n"
6969
" --prefill-drafter <path> Drafter GGUF for compression (Qwen3-0.6B)\n"
7070
" --prefill-skip-park Skip park/unpark (for >=32GB GPUs)\n"
71-
" --no-lazy-draft Keep decode draft loaded at all times\n"
71+
" --lazy-draft Park decode draft when idle to save VRAM\n"
7272
"\n"
7373
"Disk KV cache:\n"
7474
" --kv-cache-dir <path> Directory for ondisk KV cache (enables feature)\n"
@@ -141,8 +141,8 @@ int main(int argc, char ** argv) {
141141
sconfig.pflash_drafter_path = argv[++i];
142142
} else if (std::strcmp(argv[i], "--prefill-skip-park") == 0) {
143143
sconfig.pflash_skip_park = true;
144-
} else if (std::strcmp(argv[i], "--no-lazy-draft") == 0) {
145-
sconfig.lazy_draft = false;
144+
} else if (std::strcmp(argv[i], "--lazy-draft") == 0) {
145+
sconfig.lazy_draft = true;
146146
} else if (std::strcmp(argv[i], "--kv-cache-dir") == 0 && i + 1 < argc) {
147147
sconfig.disk_cache_dir = argv[++i];
148148
} else if (std::strcmp(argv[i], "--kv-cache-budget") == 0 && i + 1 < argc) {
@@ -197,6 +197,12 @@ int main(int argc, char ** argv) {
197197
setenv("DFLASH27B_FA_WINDOW", "0", 0);
198198
}
199199

200+
// Lazy-draft is only useful when both the prefill drafter and a decode draft are configured; otherwise warn and turn it off.
201+
if (sconfig.lazy_draft && !(pflash_enabled && bargs.draft_path)) {
202+
std::fprintf(stderr, "[server] --lazy-draft ignored: requires both --prefill-drafter and --draft\n");
203+
sconfig.lazy_draft = false;
204+
}
205+
200206
// Load tokenizer.
201207
std::fprintf(stderr, "[server] loading tokenizer from %s\n", bargs.model_path);
202208
Tokenizer tokenizer;

0 commit comments

Comments
 (0)