|
16 | 16 | #include <cstddef> |
17 | 17 | #include <cinttypes> |
18 | 18 | #include <cstdio> |
| 19 | +#include <cstring> |
19 | 20 | #include <vector> |
20 | 21 | #include <exception> |
21 | 22 | #include <memory> |
|
33 | 34 |
|
34 | 35 | using json = nlohmann::ordered_json; |
35 | 36 |
|
| 37 | +namespace { |
| 38 | + |
| 39 | +// Token used to replace LLAMA_TOKEN_NULL placeholders when priming Qwen NextN draft KV (see common_speculative_begin). Resolution order: the first candidate pad piece that tokenizes to exactly one token, else the vocab pad token, else token id 0.
| 40 | +static llama_token server_nextn_mtmd_fill_token(const llama_model * model) {
| 41 | + const llama_vocab * vocab = llama_model_get_vocab(model);
| 42 | + if (!vocab) { // no vocab to query: fall back to token id 0
| 43 | + return 0;
| 44 | + }
| 45 | + static const char * const k_candidates[] = { // candidate pad pieces, tried in this order
| 46 | + "<|image_pad|>",
| 47 | + "<|IMAGE_PAD|>",
| 48 | + "<|vision_pad|>",
| 49 | + };
| 50 | + std::vector<llama_token> buf(32); // scratch output buffer; only a single-token result is accepted below
| 51 | + for (const char * piece : k_candidates) {
| 52 | + const int32_t n = llama_tokenize(
| 53 | + vocab, piece, (int32_t) std::strlen(piece),
| 54 | + buf.data(), (int32_t) buf.size(), false, true); // NOTE(review): trailing flags are presumably add_special=false, parse_special=true - confirm against llama.h
| 55 | + if (n == 1) { // accept only a piece that maps to exactly one token; any other count moves on to the next candidate
| 56 | + return buf[0];
| 57 | + }
| 58 | + }
| 59 | + const llama_token pad = llama_vocab_pad(vocab); // no candidate matched: try the vocab's pad token
| 60 | + return pad != LLAMA_TOKEN_NULL ? pad : 0; // last resort is token id 0 when the vocab reports no pad token
| 61 | +}
| 62 | + |
| 63 | +} // namespace |
| 64 | + |
36 | 65 | constexpr int HTTP_POLLING_SECONDS = 1; |
37 | 66 |
|
38 | 67 | // state diagram: https://github.com/ggml-org/llama.cpp/pull/9283 |
@@ -820,9 +849,10 @@ struct server_context_impl { |
820 | 849 | SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled"); |
821 | 850 | } |
822 | 851 |
|
823 | | - if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) { |
824 | | - params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE; |
825 | | - SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled"); |
| 852 | + if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE && |
| 853 | + !common_speculative_is_mtmd_safe(params_base.speculative.type)) { |
| 854 | + params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE; |
| 855 | + SRV_WRN("%s\n", "speculative decoding with this type is not supported by multimodal, it will be disabled"); |
826 | 856 | } |
827 | 857 | } |
828 | 858 |
|
@@ -888,8 +918,8 @@ struct server_context_impl { |
888 | 918 | if (can_spec) { |
889 | 919 | slot.spec = common_speculative_init(params_base.speculative, slot.ctx); |
890 | 920 | if (slot.spec) { |
891 | | - if (mctx) { |
892 | | - SRV_ERR("%s\n", "speculative decoding is not supported with multimodal"); |
| 921 | + if (mctx && !common_speculative_all_impls_mtmd_safe(slot.spec)) { |
| 922 | + SRV_ERR("%s\n", "speculative decoding with this type is not supported with multimodal"); |
893 | 923 | return false; |
894 | 924 | } |
895 | 925 | // MTP reads target's KV memory by sequence id; bind to slot.id (server uses slot.id as seq_id). |
@@ -2205,14 +2235,22 @@ struct server_context_impl { |
2205 | 2235 | // generate draft tokens in speculative decoding mode |
2206 | 2236 | // TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK] |
2207 | 2237 | // perform the speculative drafting for all sequences at the same time in a single batch |
2208 | | - const int n_draft_max = slot.get_n_draft_max(); |
2209 | | - if (n_draft_max > 0) { |
2210 | | - if (mctx) { |
2211 | | - // we should never reach this, as speculative is automatically disabled if mmproj is loaded |
2212 | | - GGML_ABORT("not supported by multimodal"); |
2213 | | - } |
| 2238 | + const int n_draft_max_raw = slot.get_n_draft_max(); |
| 2239 | + const bool mtmd_safe_spec = slot.spec && common_speculative_all_impls_mtmd_safe(slot.spec); |
| 2240 | + if (mctx && n_draft_max_raw > 0 && !mtmd_safe_spec) { |
| 2241 | + GGML_ABORT("not supported by multimodal"); |
| 2242 | + } |
2214 | 2243 |
|
2215 | | - const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens(); |
| 2244 | + // NextN/MTP prime requires per-token target hidden states which the mtmd image-decode |
| 2245 | + // path does not produce. Until that is wired in, skip drafting for slots whose prompt |
| 2246 | + // contains image chunks - the slot still works as a normal (non-speculative) decode. |
| 2247 | + const bool skip_draft_mtmd = mctx && slot.prompt.tokens.has_mtmd; |
| 2248 | + const int n_draft_max = skip_draft_mtmd ? 0 : n_draft_max_raw; |
| 2249 | + |
| 2250 | + if (n_draft_max > 0) { |
| 2251 | + static const llama_tokens k_empty_prompt_tgt; |
| 2252 | + const llama_tokens & cached_text_tokens = |
| 2253 | + (mctx && mtmd_safe_spec) ? k_empty_prompt_tgt : slot.prompt.tokens.get_text_tokens(); |
2216 | 2254 |
|
2217 | 2255 | const auto & params_spec = slot.task->params.speculative; |
2218 | 2256 |
|
@@ -3008,7 +3046,15 @@ struct server_context_impl { |
3008 | 3046 | slot.state = SLOT_STATE_GENERATING; |
3009 | 3047 |
|
3010 | 3048 | if (slot.can_speculate()) { |
3011 | | - common_speculative_begin(slot.spec, slot.prompt.tokens.get_text_tokens()); |
| 3049 | + if (slot.prompt.tokens.has_mtmd) { |
| 3050 | + // Skip spec begin/prime for mtmd prompts: the per-token target hidden |
| 3051 | + // states for image positions are not currently produced, which makes |
| 3052 | + // NextN prime partial and could desync RoPE positions on later drafts. |
| 3053 | + // The slot will still generate correctly via the non-speculative path. |
| 3054 | + SLT_INF(slot, "%s", "skipping speculative prime for multimodal prompt\n"); |
| 3055 | + } else { |
| 3056 | + common_speculative_begin(slot.spec, slot.prompt.tokens.get_text_tokens()); |
| 3057 | + } |
3012 | 3058 | } |
3013 | 3059 | } else if (slot.state != SLOT_STATE_GENERATING) { |
3014 | 3060 | continue; // continue loop of slots |
@@ -3099,7 +3145,11 @@ struct server_context_impl { |
3099 | 3145 | slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft); |
3100 | 3146 |
|
3101 | 3147 | // add accepted tokens to the prompt |
3102 | | - slot.prompt.tokens.insert({ids.begin(), ids.end() - 1}); |
| 3148 | + // note: use push_back loop instead of insert() so mtmd prompts work too |
| 3149 | + // (server_tokens::insert asserts !has_mtmd; push_back is mtmd-safe). |
| 3150 | + for (auto it = ids.begin(); it != ids.end() - 1; ++it) { |
| 3151 | + slot.prompt.tokens.push_back(*it); |
| 3152 | + } |
3103 | 3153 | slot.sampled = ids.back(); // last accepted token |
3104 | 3154 |
|
3105 | 3155 | llama_context_nextn_seq_rm(ctx, slot.id, slot.prompt.n_tokens(), -1); |
|
0 commit comments