@@ -68,7 +68,7 @@ static void print_usage(const char * prog) {
6868 " --prefill-keep-ratio <F> Fraction of tokens to keep (default: 0.05)\n "
6969 " --prefill-drafter <path> Drafter GGUF for compression (Qwen3-0.6B)\n "
7070 " --prefill-skip-park Skip park/unpark (for >=32GB GPUs)\n "
71- " --no- lazy-draft Keep decode draft loaded at all times \n "
71+ " --lazy-draft Park decode draft when idle to save VRAM \n "
7272 " \n "
7373 " Disk KV cache:\n "
7474 " --kv-cache-dir <path> Directory for ondisk KV cache (enables feature)\n "
@@ -141,8 +141,8 @@ int main(int argc, char ** argv) {
141141 sconfig.pflash_drafter_path = argv[++i];
142142 } else if (std::strcmp (argv[i], " --prefill-skip-park" ) == 0 ) {
143143 sconfig.pflash_skip_park = true ;
144- } else if (std::strcmp (argv[i], " --no- lazy-draft" ) == 0 ) {
145- sconfig.lazy_draft = false ;
144+ } else if (std::strcmp (argv[i], " --lazy-draft" ) == 0 ) {
145+ sconfig.lazy_draft = true ;
146146 } else if (std::strcmp (argv[i], " --kv-cache-dir" ) == 0 && i + 1 < argc) {
147147 sconfig.disk_cache_dir = argv[++i];
148148 } else if (std::strcmp (argv[i], " --kv-cache-budget" ) == 0 && i + 1 < argc) {
@@ -197,6 +197,12 @@ int main(int argc, char ** argv) {
197197 setenv (" DFLASH27B_FA_WINDOW" , " 0" , 0 );
198198 }
199199
200+ // Lazy-draft requires both prefill-drafter AND decode draft to be useful.
201+ if (sconfig.lazy_draft && !(pflash_enabled && bargs.draft_path )) {
202+ std::fprintf (stderr, " [server] --lazy-draft ignored: requires both --prefill-drafter and --draft\n " );
203+ sconfig.lazy_draft = false ;
204+ }
205+
200206 // Load tokenizer.
201207 std::fprintf (stderr, " [server] loading tokenizer from %s\n " , bargs.model_path );
202208 Tokenizer tokenizer;
0 commit comments