diff --git a/common/arg.cpp b/common/arg.cpp
index cb36c5541289..af0bacf21026 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3504,6 +3504,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.mparams_dft.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MTP_HEAD"));
+    add_opt(common_arg(
+        {"--allow-mtp-with-mmproj"},
+        "EXPERIMENTAL: keep MTP speculative decoding enabled when --mmproj is also loaded. "
+        "Default (off) is to disable MTP whenever mmproj is loaded (safe path, no SEGV). "
+        "When on, MTP is gated per-batch by server_tokens::is_pure_text_continuation: image-encoding batches "
+        "fall through to standard decode; pure-text continuation batches use the MTP draft head. "
+        "MTP state is cold-restarted at the image-to-text transition (common_speculative_reset). "
+        "Use this to get MTP speedup on the text reply portion of vision+text requests on the same llama-server "
+        "process. May incur a few tokens of warmup at image-to-text boundaries (cold-restart mode).",
+        [](common_params & params) {
+            params.speculative.allow_mtp_with_mmproj = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALLOW_MTP_WITH_MMPROJ"));
     add_opt(common_arg(
         {"--spec-replace"}, "TARGET", "DRAFT",
         "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
diff --git a/common/common.h b/common/common.h
index 674b24885572..e21aff73090b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -359,6 +359,16 @@ struct common_params_speculative {
     std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
+    // Phase C.2.2 — opt-in coexistence with multimodal (--mmproj loaded).
+    // When false (default): existing behavior — speculative decoding is unconditionally
+    // disabled if mmproj is loaded (cf. server-context.cpp PR #17 SEGV safety net).
+    // When true: speculative decoding (only MTP is exercised by this path today) stays
+    // enabled, and per-batch dispatch in the server inference loop gates draft invocation
+    // by server_tokens::is_pure_text_continuation. MTP state is cold-restarted at the
+    // image→text transition via common_speculative_reset. Behind a flag because this is
+    // a fundamental dispatch-policy change for multimodal slots.
+    bool allow_mtp_with_mmproj = false;
+
     bool has_dft() const {
         return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
     }
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 349d23dcd201..9575c4670de3 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -201,6 +201,12 @@ struct common_speculative_state {
 
     // Optional hook: drain any in-flight async work (prepare_next) and discard.
     virtual void cancel() {}
+
+    // Phase C.2.1 — cold-restart hook (foundational, no behavior change here).
+    // Stronger than cancel(): clears all per-iteration state accumulated during a generation.
+    // Default is cancel(); MTP overrides to also zero h_idx + adaptive-skip counters +
+    // cached spec params.
+    virtual void reset() { cancel(); }
 };
 
 struct common_speculative_state_draft : public common_speculative_state {
@@ -845,6 +851,26 @@ struct common_speculative_state_mtp : public common_speculative_state {
         mtp_drain_pending_discard();
     }
 
+    // Phase C.2.1 — cold-restart MTP state at a known boundary (e.g. image-encoding → text continuation).
+    // Drains any in-flight draft (like cancel) AND zeroes h_idx + adaptive-skip counters +
+    // cached spec params. Post-condition: next begin()/draft() pair behaves as if MTP was
+    // just constructed. KV memory and embeddings setting on the target are untouched —
+    // the host owns those.
+    void reset() override {
+        // 1. drain in-flight async draft and clear the one-shot skip flag (cancel semantics)
+        skip_streak_last_draft = false;
+        mtp_drain_pending_discard();
+
+        // 2. zero per-iteration h_prev pointer + adaptive-skip tracking
+        h_idx               = -1;
+        prev_n_acc_drafts   = 0;
+        zero_accept_streak  = 0;
+
+        // 3. forget cached spec params from prior draft() call so the next draft re-computes
+        //    n_steps from scratch when the host passes fresh params.
+        last_spec_params = common_params_speculative{};
+    }
+
     void prepare_next(llama_token id_last) override {
         // Kill switch for A/B testing depth-2 vs sync.
         static const bool depth2_disabled = []() {
@@ -1288,6 +1314,15 @@ bool common_speculative_is_compat(llama_context * ctx_tgt) {
 common_speculative * common_speculative_init(
         common_params_speculative & params,
         llama_context             * ctx_tgt) {
+    // Defensive: if speculative was disabled upstream (e.g. server disables it when mmproj
+    // is loaded), bail out before any impl construction. Without this guard, a caller that
+    // sets params.type=NONE but leaves params.mparams_dft.path (set via --mtp-head/--model-draft)
+    // would still trigger the DRAFT config below with ctx_dft=nullptr, crashing in the
+    // common_speculative_state_draft ctor at llama_n_batch(ctx_dft).
+    if (params.type == COMMON_SPECULATIVE_TYPE_NONE) {
+        return nullptr;
+    }
+
     llama_context * ctx_dft = nullptr;
     // Gemma4 MTP loads the assistant into the target model (llama_model_load_mtp_from_file); no second context.
     if (params.model_dft && params.type != COMMON_SPECULATIVE_TYPE_MTP) {
@@ -1569,6 +1604,15 @@ void common_speculative_cancel(common_speculative * spec) {
     }
 }
 
+void common_speculative_reset(common_speculative * spec) {
+    if (spec == nullptr) {
+        return;
+    }
+    for (auto & impl : spec->impls) {
+        impl->reset();
+    }
+}
+
 void common_speculative_print_stats(const common_speculative * spec) {
     if (spec == nullptr) {
         return;
diff --git a/common/speculative.h b/common/speculative.h
index 839237f19d4d..5cea8d3ca4b7 100644
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -58,5 +58,22 @@ void common_speculative_prepare_next(common_speculative * spec, llama_token id_l
 // snapshot (e.g. slot stop / release / new request seq_rm). Safe no-op when nothing is pending.
 void common_speculative_cancel(common_speculative * spec);
 
+// Phase C.2.1 — Cold-restart the speculative state machine (foundational API, no behavior change here).
+//
+// Stronger than cancel(): in addition to draining any in-flight draft, this clears all
+// per-iteration state accumulated during a generation — h_idx is reset to its default
+// (-1 = "last output"), draft-history counters used by adaptive skip (prev_n_acc_drafts,
+// zero_accept_streak, skip_streak_last_draft) are zeroed, and any cached spec params from
+// the previous draft() call are forgotten. After reset(), the implementation behaves as
+// if begin() had just been called on a fresh prompt.
+//
+// Intended use: at known state-boundaries that are NOT prompt boundaries but DO invalidate
+// the assistant's hidden-state assumptions — e.g. when a slot transitions from image-encoding
+// (where MTP was gated off) back to text continuation (where MTP should re-engage from a clean
+// slate). The next few text tokens incur the usual warmup cost but state desync is avoided.
+//
+// Safe no-op for non-MTP implementations.
+void common_speculative_reset(common_speculative * spec);
+
 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 51a47b209bba..44106df9261b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -151,6 +151,10 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
     # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API (when building with shared libraries)
     llama_build_and_test(test-sampling.cpp)
     llama_build_and_test(test-speculative-mtp.cpp)
+    # Phase C.2.0 — server_tokens coexistence APIs unit tests
+    llama_build_and_test(test-server-tokens.cpp)
+    target_include_directories(test-server-tokens PRIVATE ${PROJECT_SOURCE_DIR}/tools/server ${PROJECT_SOURCE_DIR}/tools/mtmd)
+    target_link_libraries(test-server-tokens PRIVATE server-context)
     llama_build_and_test(test-reasoning-budget.cpp)
     llama_build_and_test(test-grammar-parser.cpp)
     llama_build_and_test(test-grammar-integration.cpp)
diff --git a/tests/test-server-tokens.cpp b/tests/test-server-tokens.cpp
new file mode 100644
index 000000000000..bf17943e86b7
--- /dev/null
+++ b/tests/test-server-tokens.cpp
@@ -0,0 +1,121 @@
+// Phase C.2.0 — unit tests for server_tokens coexistence APIs introduced for MTP+mmproj dispatch.
+//
+// Scope:
+//   - is_pure_text_continuation(from_idx)
+//   - last_image_end_idx()
+//   - get_text_tokens_post_media()
+//
+// These APIs are foundational and do not change runtime behavior; they expose information that
+// future per-batch dispatch will use. This file covers what is testable WITHOUT loading a model
+// or running the full mtmd pipeline:
+//   - non-multimodal (has_mtmd=false) buffers
+//   - empty multimodal (has_mtmd=true, no chunks) buffers
+//   - empty buffer (size==0) edge cases
+//
+// The WITH-image cases require a real mtmd_input_chunk (which goes through the mtmd public API
+// requiring an image file + clip model). Those are covered by integration tests in C.2.4 once
+// the dispatch behavior is wired up.
+
+#include "server-common.h"
+
+#include <cstdio>
+#include <cstdlib>
+
+#define CHECK(cond, msg)                                                                              \
+    do {                                                                                              \
+        if (!(cond)) {                                                                                \
+            std::fprintf(stderr, "FAIL %s:%d  %s   (cond: %s)\n", __FILE__, __LINE__, msg, #cond);    \
+            std::exit(1);                                                                             \
+        }                                                                                             \
+    } while (0)
+
+static void test_non_mtmd_empty_buffer() {
+    llama_tokens t;
+    server_tokens st(t, /*has_mtmd*/ false);
+
+    CHECK(st.size() == 0,                              "empty size");
+    CHECK(st.empty(),                                  "empty()");
+    CHECK(st.last_image_end_idx() == 0,                "last_image_end_idx empty");
+    CHECK(st.is_pure_text_continuation(0),             "pure-text @ 0 (empty)");
+    CHECK(st.is_pure_text_continuation(100),           "pure-text @ 100 (empty/past-end)");
+
+    llama_tokens out = st.get_text_tokens_post_media();
+    CHECK(out.empty(),                                 "post-media tail empty for empty buffer");
+}
+
+static void test_non_mtmd_text_only() {
+    llama_tokens t = {1, 2, 3, 4, 5};
+    server_tokens st(t, /*has_mtmd*/ false);
+
+    CHECK(st.size() == 5,                              "size==5");
+    CHECK(!st.empty(),                                 "!empty");
+    CHECK(st.last_image_end_idx() == 0,                "last_image_end_idx text-only -> 0");
+
+    // is_pure_text_continuation always true when !has_mtmd
+    CHECK(st.is_pure_text_continuation(0),             "pure-text @ 0");
+    CHECK(st.is_pure_text_continuation(3),             "pure-text @ 3");
+    CHECK(st.is_pure_text_continuation(5),             "pure-text @ 5 (at end)");
+    CHECK(st.is_pure_text_continuation(999),           "pure-text @ 999 (past end)");
+
+    // For non-mtmd, get_text_tokens_post_media returns all tokens (no NULL stripped because none present).
+    llama_tokens out = st.get_text_tokens_post_media();
+    CHECK(out.size() == 5,                             "post-media tail size matches buffer");
+    for (size_t i = 0; i < out.size(); ++i) {
+        CHECK(out[i] == t[i],                          "post-media tail token matches");
+    }
+
+    // get_text_tokens() must still return the canonical reference for non-mtmd path.
+    const llama_tokens & ref = st.get_text_tokens();
+    CHECK(ref.size() == 5,                             "get_text_tokens() size");
+    CHECK(ref.data() != out.data(),                    "post-media tail is a distinct copy");
+}
+
+static void test_mtmd_empty_chunks() {
+    // server_tokens with has_mtmd=true but no media chunks added: same observable behavior as non-mtmd
+    // for the new APIs (per-API contract: empty map → return as text-only).
+    // We construct via the llama_tokens ctor + force has_mtmd=true via the public mutable field
+    // (server_tokens exposes has_mtmd as public — see server-common.h:126).
+    llama_tokens t = {10, 20, 30};
+    server_tokens st(t, /*has_mtmd*/ false);
+    st.has_mtmd = true;  // simulate mtmd-enabled buffer with no chunks yet
+
+    CHECK(st.last_image_end_idx() == 0,                "mtmd+empty-map: last_image_end_idx==0");
+    CHECK(st.is_pure_text_continuation(0),             "mtmd+empty-map: pure @ 0");
+    CHECK(st.is_pure_text_continuation(3),             "mtmd+empty-map: pure @ 3");
+    CHECK(st.is_pure_text_continuation(999),           "mtmd+empty-map: pure @ past-end");
+
+    llama_tokens out = st.get_text_tokens_post_media();
+    CHECK(out.size() == 3,                             "mtmd+empty-map: tail returns all text");
+    CHECK(out[0] == 10 && out[1] == 20 && out[2] == 30, "mtmd+empty-map: tail content matches");
+}
+
+static void test_pure_text_continuation_semantics() {
+    // The contract: is_pure_text_continuation(from_idx) returns true iff there is NO image chunk
+    // extending past from_idx. We can verify the non-mtmd / empty-mtmd branches here (the
+    // with-image branch is exercised by integration tests once mtmd is wired up).
+    llama_tokens t = {7, 8, 9};
+    server_tokens st(t, false);
+
+    CHECK(st.is_pure_text_continuation(0),             "from_idx<size: true");
+    CHECK(st.is_pure_text_continuation(2),             "from_idx<size: true");
+    CHECK(st.is_pure_text_continuation(3),             "from_idx==size: true");
+    CHECK(st.is_pure_text_continuation(4),             "from_idx>size: true (past end)");
+    CHECK(st.is_pure_text_continuation(SIZE_MAX),      "from_idx=SIZE_MAX: true (past end)");
+}
+
+int main() {
+    test_non_mtmd_empty_buffer();
+    std::printf("[server_tokens] non_mtmd_empty_buffer            OK\n");
+
+    test_non_mtmd_text_only();
+    std::printf("[server_tokens] non_mtmd_text_only               OK\n");
+
+    test_mtmd_empty_chunks();
+    std::printf("[server_tokens] mtmd_empty_chunks                OK\n");
+
+    test_pure_text_continuation_semantics();
+    std::printf("[server_tokens] pure_text_continuation_semantics OK\n");
+
+    std::printf("ALL PASS — 4 test groups, server_tokens C.2.0 foundational API\n");
+    return 0;
+}
diff --git a/tests/test-speculative-mtp.cpp b/tests/test-speculative-mtp.cpp
index 54f4c34051a3..a50a83e09769 100644
--- a/tests/test-speculative-mtp.cpp
+++ b/tests/test-speculative-mtp.cpp
@@ -1,4 +1,5 @@
 #include "llama.h"
+#include "speculative.h"
 
 #include <algorithm>
 #include <cstdlib>
@@ -10,6 +11,13 @@
 // Set env vars to run non-skip paths; otherwise exits 0.
 
 int main() {
+    // Phase C.2.1 — contract smoke: common_speculative_reset / common_speculative_cancel
+    // must be safe no-ops on a null spec (matches the documented contract in speculative.h).
+    // Runs unconditionally — no model files required.
+    common_speculative_cancel(nullptr);
+    common_speculative_reset(nullptr);
+    std::cout << "[common_speculative] null-spec cancel + reset OK\n";
+
     const char * path_tgt  = std::getenv("LLAMA_MTP_TEST_TARGET");
     const char * path_head = std::getenv("LLAMA_MTP_TEST_HEAD");
     const char * path_bad  = std::getenv("LLAMA_MTP_TEST_BAD_ARCH");
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index e3f24390233b..b7bfe9f3ec7c 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -379,7 +379,15 @@ void server_tokens::push_back(server_tokens & tokens) {
 }
 
 void server_tokens::insert(const llama_tokens & inp_tokens) {
-    GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+    // Phase C.2.3 — appending text tokens at the tail is safe even when has_mtmd:
+    // map_idx_to_media is keyed by chunk start_idx, which is unaffected by tail appends.
+    // Speculative-accept path (server-context.cpp:3040) and slot-restore from file
+    // (server-context.cpp:1941) both append at the end. set_token() remains assert-guarded
+    // because position-overwrite could clobber an image chunk slot.
+    // Callers must pass only text tokens (no LLAMA_TOKEN_NULL) — debug-only check.
+#ifdef GGML_DEBUG
+    for (const auto t : inp_tokens) { GGML_ASSERT(t != LLAMA_TOKEN_NULL); }
+#endif
     tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
 }
 
@@ -388,6 +396,54 @@ const llama_tokens & server_tokens::get_text_tokens() const {
     return tokens;
 }
 
+// Phase C.2.0 — coexistence APIs (see header for contract).
+
+size_t server_tokens::last_image_end_idx() const {
+    if (!has_mtmd || map_idx_to_media.empty()) {
+        return 0;
+    }
+    // map_idx_to_media is std::map sorted by start idx; rbegin() is O(1).
+    auto last = map_idx_to_media.rbegin();
+    const size_t start_idx = last->first;
+    const size_t n_tokens  = mtmd_input_chunk_get_n_tokens(last->second.get());
+    return start_idx + n_tokens;
+}
+
+bool server_tokens::is_pure_text_continuation(size_t from_idx) const {
+    if (!has_mtmd || map_idx_to_media.empty()) {
+        return true;
+    }
+    return from_idx >= last_image_end_idx();
+}
+
+llama_tokens server_tokens::get_text_tokens_post_media() const {
+    if (!has_mtmd || map_idx_to_media.empty()) {
+        // Defensive: even in pure-text mode the buffer should not contain LLAMA_TOKEN_NULL,
+        // but strip just in case to keep the post-condition invariant uniform.
+        llama_tokens out;
+        out.reserve(tokens.size());
+        for (const auto & t : tokens) {
+            if (t != LLAMA_TOKEN_NULL) {
+                out.push_back(t);
+            }
+        }
+        return out;
+    }
+    const size_t start = last_image_end_idx();
+    llama_tokens out;
+    if (start >= tokens.size()) {
+        return out;
+    }
+    out.reserve(tokens.size() - start);
+    for (size_t i = start; i < tokens.size(); ++i) {
+        const llama_token t = tokens[i];
+        if (t != LLAMA_TOKEN_NULL) {
+            out.push_back(t);
+        }
+    }
+    return out;
+}
+
 void server_tokens::set_token(llama_pos pos, llama_token id) {
     GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
     tokens[pos] = id;
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index 440ebc597af7..5470083129d7 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -189,6 +189,30 @@ struct server_tokens {
     // for compatibility with speculative decoding, ctx shift, slot save/load
     const llama_tokens & get_text_tokens() const;
 
+    // Phase C.2.0 — coexistence APIs for MTP + mmproj dispatch (foundational, no behavior change here).
+    //
+    // is_pure_text_continuation(from_idx) — O(log n) oracle:
+    //   "if a caller decodes starting at position from_idx, will all tokens through end-of-buffer
+    //    be pure text (no image chunks remaining)?"
+    //   Used by the server to gate per-batch MTP draft dispatch when mmproj is also loaded.
+    //   - !has_mtmd          → always true
+    //   - map empty          → always true
+    //   - from_idx >= last_image_end_idx() → true (we're past every image chunk)
+    //   - otherwise          → false (an image chunk still extends past from_idx)
+    bool is_pure_text_continuation(size_t from_idx) const;
+
+    // End-exclusive idx of the last image/audio chunk in the buffer (start + n_tokens).
+    // Returns 0 if there are no media chunks. !has_mtmd → 0.
+    size_t last_image_end_idx() const;
+
+    // Returns the suffix of text tokens after the last media chunk.
+    // - !has_mtmd          → returns a copy of all tokens
+    // - map empty          → returns a copy of all tokens
+    // - otherwise          → tokens[last_image_end_idx() ..] with any LLAMA_TOKEN_NULL stripped
+    // Returned by value because the underlying buffer may interleave images and the suffix is
+    // not a contiguous slice. Callers typically bind to a const ref of the temporary.
+    llama_tokens get_text_tokens_post_media() const;
+
     // for compatibility with speculative decoding
     void set_token(llama_pos pos, llama_token id);
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index fea3ed875026..6abe8a8ebacc 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -59,6 +59,14 @@ struct server_slot {
 
     common_speculative * spec = nullptr;
 
+    // Phase C.2.3 — tracks whether the prior decode batch for this slot was inside an
+    // image-encoding phase (image tokens still pending in slot.prompt.tokens). Used by
+    // the per-batch MTP dispatch gate in the main inference loop: on the first batch
+    // where this is true but the new batch is pure-text continuation, common_speculative_reset
+    // is called to cold-restart MTP state. Only meaningful when slot has mctx + slot.spec
+    // (i.e. --allow-mtp-with-mmproj path).
+    bool mtp_was_image_phase = false;
+
     // TODO: move members that belong to the task (such as `generated_text`, `has_new_line`) to task_results_state
     //       see https://github.com/ggml-org/llama.cpp/pull/18283#issuecomment-3710175837
     std::unique_ptr<const server_task> task;
@@ -734,8 +742,22 @@ struct server_context_impl {
             }
 
             if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) {
-                params_base.speculative.type =  COMMON_SPECULATIVE_TYPE_NONE;
-                SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled");
+                // Phase C.2.2 — opt-in coexistence with multimodal. Default behavior (flag off)
+                // keeps the PR #17 SEGV safety net: disable speculative decoding entirely when
+                // mmproj is loaded. With --allow-mtp-with-mmproj set, the speculative state is
+                // kept alive and dispatch is gated per-batch in the main inference loop using
+                // server_tokens::is_pure_text_continuation.
+                if (!params_base.speculative.allow_mtp_with_mmproj) {
+                    params_base.speculative.type =  COMMON_SPECULATIVE_TYPE_NONE;
+                    // Also clear the draft model path so common_speculative_init does not
+                    // observe an orphan has_draft=true with type=NONE (would build a DRAFT
+                    // config and crash on ctx_dft=nullptr). See common/speculative.cpp init.
+                    params_base.speculative.mparams_dft.path.clear();
+                    params_base.speculative.model_dft = nullptr;
+                    SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled");
+                } else {
+                    SRV_INF("%s\n", "speculative decoding kept enabled alongside multimodal (--allow-mtp-with-mmproj): per-batch gate active");
+                }
             }
         }
 
@@ -794,13 +816,19 @@ struct server_context_impl {
             if (can_spec) {
                 slot.spec = common_speculative_init(params_base.speculative, slot.ctx);
                 if (slot.spec) {
-                    if (mctx) {
+                    if (mctx && !params_base.speculative.allow_mtp_with_mmproj) {
+                        // Phase C.2.2 — without the opt-in flag, refuse spec+mmproj coexistence
+                        // (PR #17 safety net). With the flag set, fall through: per-batch dispatch
+                        // in the main inference loop will gate MTP invocation based on whether the
+                        // current batch is pure-text continuation or contains image tokens.
                         SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
                         return false;
                     }
                     // MTP reads target's KV memory by sequence id; bind to slot.id (server uses slot.id as seq_id).
                     common_speculative_set_seq_id(slot.spec, slot.id);
-                    SLT_INF(slot, "%s", "speculative decoding context initialized\n");
+                    SLT_INF(slot, "%s%s\n",
+                            "speculative decoding context initialized",
+                            mctx ? " (gated per-batch for multimodal slot)" : "");
                 } else {
                     SLT_INF(slot, "%s", "speculative decoding context not initialized\n");
                 }
@@ -2112,13 +2140,44 @@ struct server_context_impl {
             // TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK]
             //       perform the speculative drafting for all sequences at the same time in a single batch
             const int n_draft_max = slot.get_n_draft_max();
-            if (n_draft_max > 0) {
-                if (mctx) {
-                    // we should never reach this, as speculative is automatically disabled if mmproj is loaded
-                    GGML_ABORT("not supported by multimodal");
-                }
 
-                const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
+            // Phase C.2.3 — per-batch MTP dispatch gate for multimodal slots.
+            // Possible runtime states for (mctx, slot.spec, can_mtp_now):
+            //   (null,    null,    -)     : no spec, no mmproj         → standard decode
+            //   (null,    non-null,true)  : spec, no mmproj            → MTP always (existing behavior)
+            //   (non-null,null,    -)     : mmproj, no spec            → standard decode (flag off path)
+            //   (non-null,non-null,true)  : mmproj+spec, pure-text now → MTP draft (NEW)
+            //   (non-null,non-null,false) : mmproj+spec, image pending → standard decode + reset on next text
+            const bool slot_has_mmproj   = (mctx != nullptr);
+            const bool slot_has_spec     = (slot.spec != nullptr);
+            const bool is_text_now       = !slot_has_mmproj
+                                        || slot.prompt.tokens.is_pure_text_continuation(slot.prompt.tokens.size());
+            const bool can_mtp_now       = slot_has_spec && is_text_now;
+            const bool image_to_text_now = slot_has_spec && slot_has_mmproj
+                                        && slot.mtp_was_image_phase && is_text_now;
+
+            // If we just transitioned from image-encoding to text continuation, cold-restart MTP
+            // state so the assistant head starts from a clean slate. The next few draft tokens
+            // incur warmup cost but state desync is impossible (Option 2 from design brief §6).
+            if (image_to_text_now) {
+                common_speculative_reset(slot.spec);
+                SLT_DBG(slot, "%s\n", "MTP state cold-restarted at image-to-text boundary");
+            }
+
+            // Update per-slot phase tracking for next iteration. Only meaningful when both
+            // mmproj and spec are live (the only case where image phases can re-occur).
+            if (slot_has_mmproj && slot_has_spec) {
+                slot.mtp_was_image_phase = !is_text_now;
+            }
+
+            if (n_draft_max > 0 && can_mtp_now) {
+                // Pure-text path: choose the cached-token view based on whether the slot has mmproj.
+                // For pure-text-only slots (no mmproj), get_text_tokens() returns the canonical reference.
+                // For multimodal slots in a pure-text continuation, the suffix after the last image is what
+                // MTP should reason about. Bound the temporary's lifetime to the rest of the block via
+                // const-ref binding.
+                const llama_tokens   mctx_view = slot_has_mmproj ? slot.prompt.tokens.get_text_tokens_post_media() : llama_tokens{};
+                const llama_tokens & cached_text_tokens = slot_has_mmproj ? mctx_view : slot.prompt.tokens.get_text_tokens();
 
                 const auto & params_spec = slot.task->params.speculative;
 
@@ -2878,7 +2937,16 @@ struct server_context_impl {
                     slot.state = SLOT_STATE_GENERATING;
 
                     if (slot.can_speculate()) {
-                        common_speculative_begin(slot.spec, slot.prompt.tokens.get_text_tokens());
+                        // Phase C.2.3 — multimodal-safe begin: get_text_tokens() asserts !has_mtmd,
+                        // so when mmproj is loaded use the post-media view (suffix after last image).
+                        // Matches the pattern used in the per-batch dispatch hot path below.
+                        const llama_tokens   mctx_view_begin = slot.prompt.tokens.has_mtmd
+                                                             ? slot.prompt.tokens.get_text_tokens_post_media()
+                                                             : llama_tokens{};
+                        const llama_tokens & tokens_for_begin = slot.prompt.tokens.has_mtmd
+                                                              ? mctx_view_begin
+                                                              : slot.prompt.tokens.get_text_tokens();
+                        common_speculative_begin(slot.spec, tokens_for_begin);
                     }
                 } else if (slot.state != SLOT_STATE_GENERATING) {
                     continue; // continue loop of slots