Skip to content

Commit 186bccc

Browse files
committed
fix(mtp): cubic review — invariant-guard prefill capture + hoist migrate to init
P1: capture-invariant violation now fails loud instead of clearing all_prefill_hidden mid-loop, which let the next chunk's memcpy write past freed memory (heap UB). P2: migrate_prefill_cache moved out of generate() into init_mtp_(); max_ctx and gamma are config-time constants, so checking the bool return where backend init can fail cleanly removes the OOM-on-first-request → null ssm_intermediate → segfault path. PR Luce-Org#214 review-4308296776 (cubic-bot P1+P2).
1 parent 3f26661 commit 186bccc

2 files changed

Lines changed: 29 additions & 10 deletions

File tree

dflash/src/common/mtp_orchestrator.cpp

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,12 +72,17 @@ GenerateResult warm_and_decode(ModelBackend * backend,
7272
}
7373
int n_chunk = 0;
7474
const float * h_seq = target->last_hidden_seq(&n_chunk);
75-
if (h_seq && n_chunk == n) {
76-
std::memcpy(all_prefill_hidden.data() + (size_t)start * hidden,
77-
h_seq, sizeof(float) * (size_t)n * hidden);
78-
} else {
79-
all_prefill_hidden.clear(); // warm_head_kv will be skipped
75+
// Invariant: capture is enabled+pinned by MTP attach, so verify_batch
76+
// must return the full chunk. If it doesn't, fail loud rather than
77+
// silently mangle all_prefill_hidden — clearing it (the pre-fix
78+
// behavior) made the next chunk's memcpy write past freed memory.
79+
if (!h_seq || n_chunk != n) {
80+
result.error = "warm_and_decode: hidden seq capture invariant violated";
81+
io.emit(-1);
82+
return result;
8083
}
84+
std::memcpy(all_prefill_hidden.data() + (size_t)start * hidden,
85+
h_seq, sizeof(float) * (size_t)n * hidden);
8186
start += n;
8287
}
8388
result.prefill_s = std::chrono::duration<double>(

dflash/src/qwen35/qwen35_backend.cpp

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -355,12 +355,11 @@ GenerateResult Qwen35Backend::generate(const GenerateRequest & req,
355355
// didn't.
356356
reset_target_cache(cache_);
357357

358-
// MTP path: delegate to common orchestrator (step 3.2 refactor).
358+
// MTP path: delegate to common orchestrator. Cache was already sized in
359+
// init_mtp_() — no per-request migrate (would be idempotent no-op since
360+
// rollback_ctx is set; checking return there was a latent OOM crash path,
361+
// per momus audit of cubic#3257248868).
359362
if (supports_mtp()) {
360-
const int gamma_eff = (cfg_.mtp_gamma > 0) ? cfg_.mtp_gamma : 3;
361-
migrate_prefill_cache(w_, cfg_.device.max_ctx,
362-
std::max(gamma_eff + 1, DFLASH27B_DRAFT_BLOCK_SIZE),
363-
target_backend_, cache_);
364363
return common::mtp::warm_and_decode(this, req, io);
365364
}
366365

@@ -726,6 +725,21 @@ bool Qwen35Backend::init_mtp_() {
726725
// module->max_gamma()) cannot happen by construction once this is in place.
727726
mtp_module_->set_effective_gamma(cfg_.mtp_gamma);
728727

728+
// Pre-size the rollback cache once for the MTP gamma chosen at attach.
729+
// Per momus audit of cubic#3257248868: hoisting out of generate() makes
730+
// the OOM-on-first-request → nullptr ssm_intermediate → segfault path
731+
// unreachable (max_ctx + γ are config-time constants; check return here
732+
// where we can fail backend init cleanly).
733+
const int gamma_eff = (cfg_.mtp_gamma > 0) ? cfg_.mtp_gamma : 3;
734+
if (!migrate_prefill_cache(w_, cfg_.device.max_ctx,
735+
std::max(gamma_eff + 1, DFLASH27B_DRAFT_BLOCK_SIZE),
736+
target_backend_, cache_)) {
737+
std::fprintf(stderr, "[mtp] migrate_prefill_cache failed (max_ctx=%d gamma=%d)\n",
738+
cfg_.device.max_ctx, gamma_eff);
739+
mtp_module_.reset();
740+
return false;
741+
}
742+
729743
if (cfg_.mtp_draft_source && std::strcmp(cfg_.mtp_draft_source, "mtp_topk") == 0) {
730744
mtp_module_->set_draft_topk(std::max(1, cfg_.mtp_draft_topk));
731745
}

0 commit comments

Comments
 (0)