Skip to content

Commit 33e35a7

Browse files
howard0su and Copilot committed
fix: release scratch VRAM buffers between requests
The target gallocr, the LM-head projection gallocr, and the BSA persistent CUDA buffers grow monotonically with request size but never shrink. After a large-prompt request (e.g. a 2k-token agent prompt), subsequent smaller requests suffer VRAM pressure, causing KV cache spill to system RAM and a ~2x decode slowdown.

Add ModelBackend::release_scratch(), called after each HTTP request completes. The Qwen35Backend implementation frees:
- sg_.alloc (target graph allocator)
- proj_sg_.alloc (LM-head projection allocator)
- BSA persistent device buffers (blockmask, head_mask_type, softmax_lse)

All are lazily recreated at the exact size needed on the next request.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 3f10692 commit 33e35a7

4 files changed

Lines changed: 40 additions & 0 deletions

File tree

dflash/src/common/model_backend.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,10 @@ struct ModelBackend {
174174
// supports_dflash_spec_decode() returns true. Default returns nullptr.
175175
virtual class DFlashTarget * dflash_target() { return nullptr; }
176176

177+
// Release oversized scratch buffers between requests to prevent VRAM
178+
// growth over time. Default is a no-op.
179+
virtual void release_scratch() {}
180+
177181
// ── Cleanup ──────────────────────────────────────────────────────
178182
// Release all resources (weights, cache, snapshots, drafter).
179183
// Called by run_daemon() before returning.

dflash/src/qwen35/qwen35_backend.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
#include "ggml-cuda.h"
1515
#include "common/snapshot_backend.h"
16+
#include "pflash_ggml_adapter.h"
17+
#include "flashprefill.h"
1618

1719
#include <algorithm>
1820
#include <chrono>
@@ -436,6 +438,32 @@ void Qwen35Backend::shutdown() {
436438
}
437439
}
438440

441+
// ── Release scratch buffers between requests ────────────────────────────
442+
443+
// Override of ModelBackend::release_scratch(): drops request-sized scratch
// allocations so a large request does not pin VRAM for later, smaller ones.
// Releases, in order: the target graph allocator (sg_.alloc), the LM-head
// projection allocator (proj_sg_.alloc), and, only when DFLASH27B_HAVE_BSA
// is defined, the BSA persistent buffers. Takes no arguments, returns nothing.
void Qwen35Backend::release_scratch() {
444+
// Target graph allocator: grows during large prefill batches, not needed
445+
// between requests. Will be lazily recreated on next build_target_step().
446+
if (sg_.alloc) {
447+
ggml_gallocr_free(sg_.alloc);
448+
// Clear the pointer so later code sees "no allocator" rather than a
// dangling handle.
sg_.alloc = nullptr;
449+
}
450+
// NOTE(review): step_graph_free() is defined elsewhere; presumably it
// resets the remaining step-graph state and tolerates alloc == nullptr.
// Confirm against its definition.
step_graph_free(sg_);
451+
452+
// LM-head projection allocator: freed and reset with the same sequence as
// the target graph allocator above.
453+
if (proj_sg_.alloc) {
454+
ggml_gallocr_free(proj_sg_.alloc);
455+
proj_sg_.alloc = nullptr;
456+
}
457+
step_graph_free(proj_sg_);
458+
459+
// BSA persistent CUDA buffers (blockmask, head_mask_type, softmax_lse).
// Compiled in only for builds that define DFLASH27B_HAVE_BSA.
460+
#ifdef DFLASH27B_HAVE_BSA
461+
flashprefill::dflash_bsa_free_persistent();
462+
#endif
463+
464+
// NOTE(review): this message is written to stderr on every call, including
// calls where neither allocator existed and nothing was actually freed.
std::fprintf(stderr, "[vram] released scratch buffers\n");
465+
}
466+
439467
// ── Generate (speculative decode) ───────────────────────────────────────
440468

441469
GenerateResult Qwen35Backend::generate(const GenerateRequest & req,

dflash/src/qwen35/qwen35_backend.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ class Qwen35Backend : public ModelBackend {
109109

110110
void shutdown() override;
111111

112+
// Release oversized scratch buffers (gallocr, BSA cache) between requests
113+
// to prevent VRAM growth over time.
114+
void release_scratch() override;
115+
112116
private:
113117
// ── Configuration ────────────────────────────────────────────────
114118
Qwen35Config cfg_;

dflash/src/server/http_server.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -730,6 +730,10 @@ void HttpServer::worker_loop() {
730730
backend_.park("draft");
731731
}
732732

733+
// Release oversized scratch buffers (gallocr, BSA cache) so VRAM
734+
// doesn't grow monotonically across requests with different sizes.
735+
backend_.release_scratch();
736+
733737
// Confirm or abort the inline snapshot.
734738
if (snap_prepared) {
735739
if (completion_tokens > 0 && !client_disconnected) {

0 commit comments

Comments
 (0)