Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions dflash/src/common/model_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,10 @@ struct ModelBackend {
// supports_dflash_spec_decode() returns true. Default returns nullptr.
virtual class DFlashTarget * dflash_target() { return nullptr; }

// Release oversized scratch buffers between requests to prevent VRAM
// growth over time. The default implementation is a no-op, so a backend
// only needs to override this if it has scratch buffers worth releasing.
virtual void release_scratch() {}

// ── Cleanup ──────────────────────────────────────────────────────
// Release all resources (weights, cache, snapshots, drafter).
// Called by run_daemon() before returning.
Expand Down
28 changes: 28 additions & 0 deletions dflash/src/qwen35/qwen35_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

#include "ggml-cuda.h"
#include "common/snapshot_backend.h"
#include "pflash_ggml_adapter.h"
#include "flashprefill.h"

#include <algorithm>
#include <chrono>
Expand Down Expand Up @@ -436,6 +438,32 @@ void Qwen35Backend::shutdown() {
}
}

// ── Release scratch buffers between requests ────────────────────────────

void Qwen35Backend::release_scratch() {
    // Tears down a single step graph: its graph allocator is freed (and the
    // handle cleared) first, then the remaining graph state is released via
    // step_graph_free().
    const auto drop_graph = [](auto & graph) {
        if (graph.alloc) {
            ggml_gallocr_free(graph.alloc);
            graph.alloc = nullptr;
        }
        step_graph_free(graph);
    };

    // Target graph allocator: grows during large prefill batches and is not
    // needed between requests. Will be lazily recreated on the next
    // build_target_step().
    drop_graph(sg_);

    // LM-head projection allocator (same pattern).
    drop_graph(proj_sg_);

#ifdef DFLASH27B_HAVE_BSA
    // BSA persistent CUDA buffers (blockmask, head_mask_type, softmax_lse).
    flashprefill::dflash_bsa_free_persistent();
#endif

    std::fprintf(stderr, "[vram] released scratch buffers\n");
}

// ── Generate (speculative decode) ───────────────────────────────────────

GenerateResult Qwen35Backend::generate(const GenerateRequest & req,
Expand Down
4 changes: 4 additions & 0 deletions dflash/src/qwen35/qwen35_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ class Qwen35Backend : public ModelBackend {

void shutdown() override;

// Release oversized scratch buffers (gallocr, BSA cache) between requests
// to prevent VRAM growth over time. Overrides the ModelBackend default;
// the implementation lives in qwen35_backend.cpp.
void release_scratch() override;

private:
// ── Configuration ────────────────────────────────────────────────
Qwen35Config cfg_;
Expand Down
15 changes: 15 additions & 0 deletions dflash/src/server/http_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -712,13 +712,28 @@ void HttpServer::worker_loop() {
};

// Run generation (with or without restore).
// Lazy-draft: ensure decode draft is loaded before generate.
if (config_.lazy_draft) {
backend_.free_drafter(); // free pflash drafter (~1.4 GB) if loaded
backend_.unpark("draft"); // reload decode draft (~3.3 GB)
}

GenerateResult result;
if (using_restore) {
result = backend_.restore_and_generate(cache_slot, gen_req, io);
} else {
result = backend_.generate(gen_req, io);
}

// Lazy-draft: park decode draft after generate to free VRAM.
if (config_.lazy_draft) {
backend_.park("draft");
}

// Release oversized scratch buffers (gallocr, BSA cache) so VRAM
// doesn't grow monotonically across requests with different sizes.
backend_.release_scratch();

// Confirm or abort the inline snapshot.
if (snap_prepared) {
if (completion_tokens > 0 && !client_disconnected) {
Expand Down
1 change: 1 addition & 0 deletions dflash/src/server/http_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ struct ServerConfig {
float pflash_keep_ratio = 0.05f; // fraction of tokens to keep
std::string pflash_drafter_path; // path to drafter GGUF (Qwen3-0.6B)
bool pflash_skip_park = false; // skip park/unpark for ≥32GB GPUs
bool lazy_draft = false; // park decode draft when idle to save VRAM

// Disk prefix cache
std::string disk_cache_dir; // empty = disabled
Expand Down
18 changes: 18 additions & 0 deletions dflash/src/server/server_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ static void print_usage(const char * prog) {
" --prefill-keep-ratio <F> Fraction of tokens to keep (default: 0.05)\n"
" --prefill-drafter <path> Drafter GGUF for compression (Qwen3-0.6B)\n"
" --prefill-skip-park Skip park/unpark (for >=32GB GPUs)\n"
" --lazy-draft Park decode draft when idle to save VRAM\n"
"\n"
"Disk KV cache:\n"
" --kv-cache-dir <path> Directory for ondisk KV cache (enables feature)\n"
Expand Down Expand Up @@ -140,6 +141,8 @@ int main(int argc, char ** argv) {
sconfig.pflash_drafter_path = argv[++i];
} else if (std::strcmp(argv[i], "--prefill-skip-park") == 0) {
sconfig.pflash_skip_park = true;
} else if (std::strcmp(argv[i], "--lazy-draft") == 0) {
sconfig.lazy_draft = true;
} else if (std::strcmp(argv[i], "--kv-cache-dir") == 0 && i + 1 < argc) {
sconfig.disk_cache_dir = argv[++i];
} else if (std::strcmp(argv[i], "--kv-cache-budget") == 0 && i + 1 < argc) {
Expand Down Expand Up @@ -194,6 +197,12 @@ int main(int argc, char ** argv) {
setenv("DFLASH27B_FA_WINDOW", "0", 0);
}

// Lazy-draft requires both prefill-drafter AND decode draft to be useful.
if (sconfig.lazy_draft && !(pflash_enabled && bargs.draft_path)) {
std::fprintf(stderr, "[server] --lazy-draft ignored: requires both --prefill-drafter and --draft\n");
sconfig.lazy_draft = false;
}

// Load tokenizer.
std::fprintf(stderr, "[server] loading tokenizer from %s\n", bargs.model_path);
Tokenizer tokenizer;
Expand Down Expand Up @@ -269,6 +278,9 @@ int main(int argc, char ** argv) {
std::fprintf(stderr, "[server] │ fp_use_bsa = %s\n", getenv("DFLASH_FP_USE_BSA") ? "ON" : "off");
std::fprintf(stderr, "[server] │ fp_alpha = %s\n", getenv("DFLASH_FP_ALPHA") ? getenv("DFLASH_FP_ALPHA") : "0.12 (default)");
}
if (bargs.draft_path) {
std::fprintf(stderr, "[server] │ lazy_draft = %s\n", sconfig.lazy_draft ? "ON" : "off");
}
std::fprintf(stderr, "[server] ╰─────────────────────────────────────────────────────╯\n\n");

HttpServer server(*backend, tokenizer, sconfig);
Expand All @@ -278,6 +290,12 @@ int main(int argc, char ** argv) {
if (pflash_enabled) {
server.set_drafter_tokenizer(&drafter_tokenizer);
}

// Lazy-draft: park decode draft at startup to free VRAM (~3.3 GB).
if (sconfig.lazy_draft && bargs.draft_path) {
backend->park("draft");
}

int ret = server.run();

// Cleanup.
Expand Down
Loading