diff --git a/common/speculative.cpp b/common/speculative.cpp index 349d23dcd201..821c30b36d46 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -1288,6 +1288,15 @@ bool common_speculative_is_compat(llama_context * ctx_tgt) { common_speculative * common_speculative_init( common_params_speculative & params, llama_context * ctx_tgt) { + // Defensive: if speculative was disabled upstream (e.g. server disables it when mmproj + // is loaded), return nullptr before any impl construction. Without this guard, a caller that + // sets params.type=NONE but leaves params.mparams_dft.path (set via --mtp-head/--model-draft) + // would still trigger the DRAFT config below with ctx_dft=nullptr, crashing in the + // common_speculative_state_draft ctor at llama_n_batch(ctx_dft). + if (params.type == COMMON_SPECULATIVE_TYPE_NONE) { + return nullptr; + } + llama_context * ctx_dft = nullptr; // Gemma4 MTP loads the assistant into the target model (llama_model_load_mtp_from_file); no second context. if (params.model_dft && params.type != COMMON_SPECULATIVE_TYPE_MTP) { diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index fea3ed875026..d49e712b498c 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -735,6 +735,11 @@ struct server_context_impl { if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) { params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE; + // Also clear the draft model path and pointer so common_speculative_init does not + // observe an orphan has_draft=true with type=NONE (would build a DRAFT + // config and crash on ctx_dft=nullptr). See common_speculative_init in common/speculative.cpp. + params_base.speculative.mparams_dft.path.clear(); + params_base.speculative.model_dft = nullptr; SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled"); } }