fix(mtp): cubic review — invariant-guard prefill capture + hoist migrate to init

dusterbloom · dusterbloom · commit 186bccc9f2ab · 2026-05-18T11:16:33.000+02:00
P1: a capture-invariant violation in warm_and_decode now fails loudly instead of clearing all_prefill_hidden mid-loop; the clear let the next chunk's memcpy write past freed memory (heap UB). P2: migrate_prefill_cache is moved out of generate() and into init_mtp_(). max_ctx and gamma are config-time constants, so the cache can be sized once at init, where the bool return is checked and backend init can fail cleanly; this removes the OOM-on-first-request → null ssm_intermediate → segfault path. Addresses PR Luce-Org#214 review-4308296776 (cubic-bot P1+P2).
diff --git a/dflash/src/common/mtp_orchestrator.cpp b/dflash/src/common/mtp_orchestrator.cpp
@@ -72,12 +72,17 @@ GenerateResult warm_and_decode(ModelBackend * backend,
         }
         int n_chunk = 0;
         const float * h_seq = target->last_hidden_seq(&n_chunk);
-        if (h_seq && n_chunk == n) {
-            std::memcpy(all_prefill_hidden.data() + (size_t)start * hidden,
-                        h_seq, sizeof(float) * (size_t)n * hidden);
-        } else {
-            all_prefill_hidden.clear();  // warm_head_kv will be skipped
+        // Invariant: capture is enabled+pinned by MTP attach, so verify_batch
+        // must return the full chunk. If it doesn't, fail loud rather than
+        // silently mangle all_prefill_hidden — clearing it (the pre-fix
+        // behavior) made the next chunk's memcpy write past freed memory.
+        if (!h_seq || n_chunk != n) {
+            result.error = "warm_and_decode: hidden seq capture invariant violated";
+            io.emit(-1);
+            return result;
         }
+        std::memcpy(all_prefill_hidden.data() + (size_t)start * hidden,
+                    h_seq, sizeof(float) * (size_t)n * hidden);
         start += n;
     }
     result.prefill_s = std::chrono::duration<double>(
diff --git a/dflash/src/qwen35/qwen35_backend.cpp b/dflash/src/qwen35/qwen35_backend.cpp
@@ -355,12 +355,11 @@ GenerateResult Qwen35Backend::generate(const GenerateRequest & req,
     // didn't.
     reset_target_cache(cache_);
 
-    // MTP path: delegate to common orchestrator (step 3.2 refactor).
+    // MTP path: delegate to common orchestrator. Cache was already sized in
+    // init_mtp_(), so no per-request migrate (it would be an idempotent no-op
+    // since rollback_ctx is set). The old call here ignored the return value,
+    // a latent OOM crash path (per momus audit of cubic#3257248868).
     if (supports_mtp()) {
-        const int gamma_eff = (cfg_.mtp_gamma > 0) ? cfg_.mtp_gamma : 3;
-        migrate_prefill_cache(w_, cfg_.device.max_ctx,
-                              std::max(gamma_eff + 1, DFLASH27B_DRAFT_BLOCK_SIZE),
-                              target_backend_, cache_);
         return common::mtp::warm_and_decode(this, req, io);
     }
 
@@ -726,6 +725,21 @@ bool Qwen35Backend::init_mtp_() {
     // module->max_gamma()) cannot happen by construction once this is in place.
     mtp_module_->set_effective_gamma(cfg_.mtp_gamma);
 
+    // Pre-size the rollback cache once for the MTP gamma chosen at attach.
+    // Per momus audit of cubic#3257248868: hoisting out of generate() makes
+    // the OOM-on-first-request → nullptr ssm_intermediate → segfault path
+    // unreachable (max_ctx + γ are config-time constants; check return here
+    // where we can fail backend init cleanly).
+    const int gamma_eff = (cfg_.mtp_gamma > 0) ? cfg_.mtp_gamma : 3;
+    if (!migrate_prefill_cache(w_, cfg_.device.max_ctx,
+                               std::max(gamma_eff + 1, DFLASH27B_DRAFT_BLOCK_SIZE),
+                               target_backend_, cache_)) {
+        std::fprintf(stderr, "[mtp] migrate_prefill_cache failed (max_ctx=%d gamma=%d)\n",
+                     cfg_.device.max_ctx, gamma_eff);
+        mtp_module_.reset();
+        return false;
+    }
+
     if (cfg_.mtp_draft_source && std::strcmp(cfg_.mtp_draft_source, "mtp_topk") == 0) {
         mtp_module_->set_draft_topk(std::max(1, cfg_.mtp_draft_topk));
     }