fix: release scratch VRAM buffers between requests

howard0su · Copilot · howard0su · commit 33e35a7564c5 · 2026-05-22T08:32:30.000+08:00
The target gallocr, LM-head projection gallocr, and BSA persistent
CUDA buffers grow monotonically with request size but never shrink.
After a large-prompt request (e.g. agent 2k tokens), subsequent
smaller requests suffer VRAM pressure causing KV cache spill to
system RAM and ~2x decode slowdown.

Add ModelBackend::release_scratch() called after each HTTP request
completes. Qwen35Backend implementation frees:
- sg_.alloc (target graph allocator)
- proj_sg_.alloc (LM-head projection allocator)
- BSA persistent device buffers (blockmask, head_mask_type, softmax_lse)

All are lazily recreated at the exact size needed on the next request.

Co-authored-by: Copilot &lt;223556219+Copilot@users.noreply.github.com&gt;
diff --git a/dflash/src/common/model_backend.h b/dflash/src/common/model_backend.h
@@ -174,6 +174,10 @@ struct ModelBackend {
     // supports_dflash_spec_decode() returns true. Default returns nullptr.
     virtual class DFlashTarget * dflash_target() { return nullptr; }
 
+    // Release oversized scratch buffers between requests to prevent VRAM
+    // growth over time. Default is a no-op.
+    virtual void release_scratch() {}
+
     // ── Cleanup ──────────────────────────────────────────────────────
     // Release all resources (weights, cache, snapshots, drafter).
     // Called by run_daemon() before returning.
diff --git a/dflash/src/qwen35/qwen35_backend.cpp b/dflash/src/qwen35/qwen35_backend.cpp
@@ -13,6 +13,8 @@
 
 #include "ggml-cuda.h"
 #include "common/snapshot_backend.h"
+#include "pflash_ggml_adapter.h"
+#include "flashprefill.h"
 
 #include <algorithm>
 #include <chrono>
@@ -436,6 +438,32 @@ void Qwen35Backend::shutdown() {
     }
 }
 
+// ── Release scratch buffers between requests ────────────────────────────
+
+void Qwen35Backend::release_scratch() {
+    // Target graph allocator: grows during large prefill batches, not needed
+    // between requests. Will be lazily recreated on next build_target_step().
+    if (sg_.alloc) {
+        ggml_gallocr_free(sg_.alloc);
+        sg_.alloc = nullptr;
+    }
+    step_graph_free(sg_);
+
+    // LM-head projection allocator (same pattern).
+    if (proj_sg_.alloc) {
+        ggml_gallocr_free(proj_sg_.alloc);
+        proj_sg_.alloc = nullptr;
+    }
+    step_graph_free(proj_sg_);
+
+    // BSA persistent CUDA buffers (blockmask, head_mask_type, softmax_lse).
+#ifdef DFLASH27B_HAVE_BSA
+    flashprefill::dflash_bsa_free_persistent();
+#endif
+
+    std::fprintf(stderr, "[vram] released scratch buffers\n");
+}
+
 // ── Generate (speculative decode) ───────────────────────────────────────
 
 GenerateResult Qwen35Backend::generate(const GenerateRequest & req,
diff --git a/dflash/src/qwen35/qwen35_backend.h b/dflash/src/qwen35/qwen35_backend.h
@@ -109,6 +109,10 @@ class Qwen35Backend : public ModelBackend {
 
     void shutdown() override;
 
+    // Release oversized scratch buffers (gallocr, BSA cache) between requests
+    // to prevent VRAM growth over time.
+    void release_scratch() override;
+
 private:
     // ── Configuration ────────────────────────────────────────────────
     Qwen35Config cfg_;
diff --git a/dflash/src/server/http_server.cpp b/dflash/src/server/http_server.cpp
@@ -730,6 +730,10 @@ void HttpServer::worker_loop() {
             backend_.park("draft");
         }
 
+        // Release oversized scratch buffers (gallocr, BSA cache) so VRAM
+        // doesn't grow monotonically across requests with different sizes.
+        backend_.release_scratch();
+
         // Confirm or abort the inline snapshot.
         if (snap_prepared) {
             if (completion_tokens > 0 && !client_disconnected) {