Skip to content

Commit 4e95702

Browse files
committed
llama: allow partial seq_rm for GDN models for speculative decoding
Currently, speculative decoding needs to restart from a checkpoint after some draft tokens are not accepted, which leads to wasted work re-running the target. This PR adds the ability to roll back up to `draft_max` tokens by storing the GDN intermediates.
1 parent f42e29f commit 4e95702

26 files changed

Lines changed: 412 additions & 93 deletions

common/common.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1422,6 +1422,11 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
14221422

14231423
// try to remove the last tokens
14241424
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
1425+
if (llama_n_rollback_max(ctx) > 0 &&
1426+
llama_model_supports_recurrent_partial_rollback(llama_get_model(ctx))) {
1427+
res = COMMON_CONTEXT_SEQ_RM_TYPE_PART;
1428+
goto done;
1429+
}
14251430
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
14261431
res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
14271432
goto done;
@@ -1490,6 +1495,12 @@ struct llama_context_params common_context_params_to_llama(const common_params &
14901495

14911496
cparams.n_ctx = params.n_ctx;
14921497
cparams.n_seq_max = params.n_parallel;
1498+
{
1499+
// TODO: add for MTP
1500+
const bool has_spec = (params.speculative.type != COMMON_SPECULATIVE_TYPE_NONE)
1501+
|| params.speculative.has_dft();
1502+
cparams.n_rollback_max = has_spec ? (uint32_t) params.speculative.draft.n_max : 0u;
1503+
}
14931504
cparams.n_batch = params.n_batch;
14941505
cparams.n_ubatch = params.n_ubatch;
14951506
cparams.n_threads = params.cpuparams.n_threads;

ggml/include/ggml.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2537,7 +2537,8 @@ extern "C" {
25372537
struct ggml_tensor * v,
25382538
struct ggml_tensor * g,
25392539
struct ggml_tensor * beta,
2540-
struct ggml_tensor * state);
2540+
struct ggml_tensor * state,
2541+
bool keep_intermediates);
25412542

25422543
// custom operators
25432544

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2933,7 +2933,9 @@ struct ggml_cplan ggml_graph_plan(
29332933
case GGML_OP_GATED_DELTA_NET:
29342934
{
29352935
const int64_t S_v = node->src[2]->ne[0];
2936-
cur = S_v * sizeof(float) * n_tasks;
2936+
const bool keep_intermediates = (((const int32_t *) node->op_params)[0] != 0);
2937+
const int64_t per_thread = S_v + (keep_intermediates ? S_v * S_v : 0);
2938+
cur = per_thread * sizeof(float) * n_tasks;
29372939
} break;
29382940
case GGML_OP_COUNT:
29392941
{

ggml/src/ggml-cpu/ops.cpp

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10467,16 +10467,20 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
1046710467

1046810468
const bool kda = (neg0 == S_v);
1046910469

10470-
// scratch layout per thread: [delta(S_v)]
10471-
const int64_t scratch_per_thread = S_v;
10470+
const bool keep_intermediates = (const bool) ggml_get_op_params_i32(dst, 0);
10471+
10472+
const int64_t per_thread = S_v + (keep_intermediates ? S_v * S_v : 0);
1047210473
const int ith = params->ith;
1047310474

10474-
float * delta = (float *)params->wdata + ith * scratch_per_thread + CACHE_LINE_SIZE_F32;
10475+
float * delta = (float *)params->wdata + ith * per_thread + CACHE_LINE_SIZE_F32;
10476+
float * state_work = keep_intermediates ? (delta + S_v) : nullptr;
1047510477

1047610478
// output layout: [attn_scores | new_states]
1047710479
// attn_scores: S_v * H * n_tokens * n_seqs floats
10478-
// new_states: S_v * S_v * H * n_seqs floats
10479-
const int64_t attn_score_elems = S_v * H * n_tokens * n_seqs;
10480+
// new_states: S_v * S_v * H * n_seqs floats (final only)
10481+
// S_v * S_v * H * n_seqs * T floats (T snaps, keep_intermediates)
10482+
const int64_t attn_score_elems = S_v * H * n_tokens * n_seqs;
10483+
const int64_t state_size_per_snap = S_v * S_v * H * n_seqs;
1048010484
float * attn_out_base = (float *)dst->data;
1048110485
float * state_out_base = (float *)dst->data + attn_score_elems;
1048210486

@@ -10499,9 +10503,11 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
1049910503
const int64_t iq3 = iv3 / rq3;
1050010504
const int64_t ik3 = iv3 / rk3;
1050110505

10502-
float * s_out = state_out_base + (iv3 * H + iv1) * S_v * S_v;
10506+
float * s_out = keep_intermediates
10507+
? state_work
10508+
: state_out_base + (iv3 * H + iv1) * S_v * S_v;
1050310509

10504-
// copy input state into output buffer and operate in-place
10510+
// copy input state into the working buffer and operate in-place
1050510511
const float * s_in = state_in_base + (iv3 * H + iv1) * S_v * S_v;
1050610512
memcpy(s_out, s_in, S_v * S_v * sizeof(float));
1050710513

@@ -10552,6 +10558,12 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
1055210558
}
1055310559

1055410560
attn_data += S_v * H; // advance to next token
10561+
10562+
if (keep_intermediates) {
10563+
float * curr_state_o = state_out_base + t * state_size_per_snap +
10564+
(iv3 * H + iv1) * S_v * S_v;
10565+
memcpy(curr_state_o, s_out, S_v * S_v * sizeof(float));
10566+
}
1055510567
}
1055610568
}
1055710569
}

ggml/src/ggml-cuda/gated_delta_net.cu

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#include "gated_delta_net.cuh"
22

3-
template <int S_v, bool KDA>
3+
template <int S_v, bool KDA, bool keep_intermediates_t>
44
__global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2)
55
gated_delta_net_cuda(const float * q,
66
const float * k,
@@ -37,7 +37,8 @@ gated_delta_net_cuda(const float * q,
3737
float * attn_data = dst;
3838
float * state = dst + attn_score_elems;
3939

40-
const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v;
40+
const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v;
41+
const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // keep_intermediates_t only
4142
state += state_offset;
4243
curr_state += state_offset + col * S_v;
4344
attn_data += (sequence * n_tokens * H + h_idx) * S_v;
@@ -135,17 +136,27 @@ gated_delta_net_cuda(const float * q,
135136
}
136137

137138
attn_data += S_v * H;
139+
140+
if constexpr (keep_intermediates_t) {
141+
float * curr_state = (dst + attn_score_elems) + t * state_size_per_token + state_offset;
142+
#pragma unroll
143+
for (int r = 0; r < rows_per_lane; r++) {
144+
const int i = r * warp_size + lane;
145+
curr_state[col * S_v + i] = s_shard[r];
146+
}
147+
}
138148
}
139149

140-
// Write state back to global memory (transposed layout)
150+
if constexpr (!keep_intermediates_t) {
141151
#pragma unroll
142-
for (int r = 0; r < rows_per_lane; r++) {
143-
const int i = r * warp_size + lane;
144-
state[col * S_v + i] = s_shard[r];
152+
for (int r = 0; r < rows_per_lane; r++) {
153+
const int i = r * warp_size + lane;
154+
state[col * S_v + i] = s_shard[r];
155+
}
145156
}
146157
}
147158

148-
template <bool KDA>
159+
template <bool KDA, bool keep_intermediates_t>
149160
static void launch_gated_delta_net(
150161
const float * q_d, const float * k_d, const float * v_d,
151162
const float * g_d, const float * b_d, const float * s_d,
@@ -169,26 +180,26 @@ static void launch_gated_delta_net(
169180

170181
switch (S_v) {
171182
case 16:
172-
gated_delta_net_cuda<16, KDA><<<grid_dims, block_dims, 0, stream>>>(
183+
gated_delta_net_cuda<16, KDA, keep_intermediates_t><<<grid_dims, block_dims, 0, stream>>>(
173184
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
174185
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
175186
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
176187
break;
177188
case 32:
178-
gated_delta_net_cuda<32, KDA><<<grid_dims, block_dims, 0, stream>>>(
189+
gated_delta_net_cuda<32, KDA, keep_intermediates_t><<<grid_dims, block_dims, 0, stream>>>(
179190
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
180191
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
181192
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
182193
break;
183194
case 64: {
184-
gated_delta_net_cuda<64, KDA><<<grid_dims, block_dims, 0, stream>>>(
195+
gated_delta_net_cuda<64, KDA, keep_intermediates_t><<<grid_dims, block_dims, 0, stream>>>(
185196
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
186197
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
187198
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
188199
break;
189200
}
190201
case 128: {
191-
gated_delta_net_cuda<128, KDA><<<grid_dims, block_dims, 0, stream>>>(
202+
gated_delta_net_cuda<128, KDA, keep_intermediates_t><<<grid_dims, block_dims, 0, stream>>>(
192203
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
193204
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
194205
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
@@ -261,13 +272,27 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *
261272

262273
cudaStream_t stream = ctx.stream();
263274

275+
const bool keep_intermediates = (((const int32_t *) dst->op_params)[0] != 0);
276+
264277
if (kda) {
265-
launch_gated_delta_net<true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
266-
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
267-
sb1, sb2, sb3, neqk1, rq3, scale, stream);
278+
if (keep_intermediates) {
279+
launch_gated_delta_net<true, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
280+
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
281+
sb1, sb2, sb3, neqk1, rq3, scale, stream);
282+
} else {
283+
launch_gated_delta_net<true, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
284+
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
285+
sb1, sb2, sb3, neqk1, rq3, scale, stream);
286+
}
268287
} else {
269-
launch_gated_delta_net<false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
270-
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
271-
sb1, sb2, sb3, neqk1, rq3, scale, stream);
288+
if (keep_intermediates) {
289+
launch_gated_delta_net<false, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
290+
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
291+
sb1, sb2, sb3, neqk1, rq3, scale, stream);
292+
} else {
293+
launch_gated_delta_net<false, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
294+
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
295+
sb1, sb2, sb3, neqk1, rq3, scale, stream);
296+
}
272297
}
273298
}

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16717,8 +16717,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
1671716717
tensor_clone = ggml_rwkv_wkv7(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3],
1671816718
src_clone[4], src_clone[5], src_clone[6]);
1671916719
} else if (tensor->op == GGML_OP_GATED_DELTA_NET) {
16720+
const bool keep_intermediates = (((const int32_t *) tensor->op_params)[0] != 0);
1672016721
tensor_clone = ggml_gated_delta_net(ggml_ctx, src_clone[0], src_clone[1],
16721-
src_clone[2], src_clone[3], src_clone[4], src_clone[5]);
16722+
src_clone[2], src_clone[3], src_clone[4], src_clone[5], keep_intermediates);
1672216723
} else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) {
1672316724
src_clone[0]->flags = tensor->src[0]->flags;
1672416725
tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1],

ggml/src/ggml.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6171,7 +6171,8 @@ struct ggml_tensor * ggml_gated_delta_net(
61716171
struct ggml_tensor * v,
61726172
struct ggml_tensor * g,
61736173
struct ggml_tensor * beta,
6174-
struct ggml_tensor * state) {
6174+
struct ggml_tensor * state,
6175+
bool keep_intermediates) {
61756176
GGML_ASSERT(ggml_is_contiguous_rows(q));
61766177
GGML_ASSERT(ggml_is_contiguous_rows(k));
61776178
GGML_ASSERT(ggml_is_contiguous_rows(v));
@@ -6197,9 +6198,8 @@ struct ggml_tensor * ggml_gated_delta_net(
61976198

61986199
GGML_ASSERT(ggml_nelements(state) == S_v * S_v * H * n_seqs);
61996200

6200-
// concat output and new_state into a single tensor
6201-
// output: S_v * H * n_tokens * n_seqs, state: S_v * S_v * H * n_seqs
6202-
const int64_t ne[4] = { S_v * H, n_tokens * n_seqs + S_v * n_seqs, 1, 1 };
6201+
const int64_t state_rows = keep_intermediates ? n_tokens * S_v * n_seqs : S_v * n_seqs;
6202+
const int64_t ne[4] = { S_v * H, n_tokens * n_seqs + state_rows, 1, 1 };
62036203
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
62046204

62056205
result->op = GGML_OP_GATED_DELTA_NET;
@@ -6210,6 +6210,9 @@ struct ggml_tensor * ggml_gated_delta_net(
62106210
result->src[4] = beta;
62116211
result->src[5] = state;
62126212

6213+
int32_t flag = keep_intermediates ? 1 : 0;
6214+
ggml_set_op_params(result, &flag, sizeof(flag));
6215+
62136216
return result;
62146217
}
62156218

include/llama.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,7 @@ extern "C" {
333333
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
334334
uint32_t n_ubatch; // physical maximum batch size
335335
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
336+
uint32_t n_rollback_max; // max recurrent-state rollback distance (0 = no rollback support)
336337
int32_t n_threads; // number of threads to use for generation
337338
int32_t n_threads_batch; // number of threads to use for batch processing
338339

@@ -530,6 +531,7 @@ extern "C" {
530531
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
531532
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
532533
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
534+
LLAMA_API uint32_t llama_n_rollback_max (const struct llama_context * ctx);
533535

534536
DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
535537
DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead");
@@ -621,6 +623,8 @@ extern "C" {
621623
// Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
622624
LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
623625

626+
LLAMA_API bool llama_model_supports_recurrent_partial_rollback(const struct llama_model * model);
627+
624628
// Returns 0 on success
625629
LLAMA_API uint32_t llama_model_quantize(
626630
const char * fname_inp,

src/llama-arch.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -876,6 +876,16 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
876876
}
877877
}
878878

879+
bool llm_arch_supports_recurrent_partial_rollback(const llm_arch & arch) {
880+
switch (arch) {
881+
case LLM_ARCH_QWEN35:
882+
case LLM_ARCH_QWEN35MOE:
883+
return true;
884+
default:
885+
return false;
886+
}
887+
}
888+
879889
bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
880890
switch (arch) {
881891
case LLM_ARCH_GROK:

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,3 +636,4 @@ bool llm_arch_is_recurrent (const llm_arch & arch);
636636
bool llm_arch_is_hybrid (const llm_arch & arch);
637637
bool llm_arch_is_diffusion (const llm_arch & arch);
638638
bool llm_arch_supports_sm_tensor(const llm_arch & arch);
639+
bool llm_arch_supports_recurrent_partial_rollback(const llm_arch & arch);

0 commit comments

Comments (0)