
Commit a8a33f6

review: rename, add asserts

1 parent 66e47d1 commit a8a33f6

7 files changed: 22 additions & 24 deletions

ggml/src/ggml-cuda/gated_delta_net.cu

Lines changed: 11 additions & 11 deletions
@@ -1,6 +1,6 @@
 #include "gated_delta_net.cuh"

-template <int S_v, bool KDA, bool keep_intermediates_t>
+template <int S_v, bool KDA, bool keep_rs_t>
 __global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2)
     gated_delta_net_cuda(const float * q,
                          const float * k,
@@ -145,7 +145,7 @@ gated_delta_net_cuda(const float * q,

     attn_data += S_v * H;

-    if constexpr (keep_intermediates_t) {
+    if constexpr (keep_rs_t) {
         const int target_slot = t - shift;
         if (target_slot >= 0 && target_slot < K) {
             float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset;
@@ -158,7 +158,7 @@ gated_delta_net_cuda(const float * q,
         }
     }

-    if constexpr (!keep_intermediates_t) {
+    if constexpr (!keep_rs_t) {
 #pragma unroll
         for (int r = 0; r < rows_per_lane; r++) {
             const int i = r * warp_size + lane;
@@ -167,7 +167,7 @@ gated_delta_net_cuda(const float * q,
     }
 }

-template <bool KDA, bool keep_intermediates_t>
+template <bool KDA, bool keep_rs_t>
 static void launch_gated_delta_net(
     const float * q_d, const float * k_d, const float * v_d,
     const float * g_d, const float * b_d, const float * s_d,
@@ -191,26 +191,26 @@ static void launch_gated_delta_net(

     switch (S_v) {
         case 16:
-            gated_delta_net_cuda<16, KDA, keep_intermediates_t><<<grid_dims, block_dims, 0, stream>>>(
+            gated_delta_net_cuda<16, KDA, keep_rs_t><<<grid_dims, block_dims, 0, stream>>>(
                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                 n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                 sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
             break;
         case 32:
-            gated_delta_net_cuda<32, KDA, keep_intermediates_t><<<grid_dims, block_dims, 0, stream>>>(
+            gated_delta_net_cuda<32, KDA, keep_rs_t><<<grid_dims, block_dims, 0, stream>>>(
                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                 n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                 sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
             break;
         case 64: {
-            gated_delta_net_cuda<64, KDA, keep_intermediates_t><<<grid_dims, block_dims, 0, stream>>>(
+            gated_delta_net_cuda<64, KDA, keep_rs_t><<<grid_dims, block_dims, 0, stream>>>(
                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                 n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                 sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
             break;
         }
         case 128: {
-            gated_delta_net_cuda<128, KDA, keep_intermediates_t><<<grid_dims, block_dims, 0, stream>>>(
+            gated_delta_net_cuda<128, KDA, keep_rs_t><<<grid_dims, block_dims, 0, stream>>>(
                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                 n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                 sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
@@ -285,10 +285,10 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *

     // state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count.
     const int K = (int) src_state->ne[1];
-    const bool keep_intermediates = K > 1;
+    const bool keep_rs = K > 1;

     if (kda) {
-        if (keep_intermediates) {
+        if (keep_rs) {
             launch_gated_delta_net<true, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
                 S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                 sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
@@ -298,7 +298,7 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *
                 sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
         }
     } else {
-        if (keep_intermediates) {
+        if (keep_rs) {
             launch_gated_delta_net<false, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
                 S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                 sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
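Note: keep_rs_t is a compile-time template parameter rather than a runtime branch, so the if constexpr snapshot-store path is compiled out entirely when rollback is disabled; ggml_cuda_op_gated_delta_net above maps its two runtime flags (kda, keep_rs) onto the four template instantiations the same way. A minimal standalone sketch of this dispatch pattern (hypothetical names, not the actual llama.cpp kernel):

    // Sketch only: fold a runtime flag into a template parameter so that
    // `if constexpr` removes the snapshot-store path from the compiled kernel.
    #include <cuda_runtime.h>

    template <bool keep_rs_t>
    __global__ void demo_kernel(float * dst, const float * state, int state_size, int K) {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= state_size) {
            return;
        }
        const float v = state[i]; // stand-in for the real recurrence update
        if constexpr (keep_rs_t) {
            // per-token snapshots: compiled only into the keep_rs_t=true instantiation
            for (int slot = 0; slot < K; slot++) {
                dst[slot * state_size + i] = v;
            }
        } else {
            dst[i] = v; // single final state, no extra global stores
        }
    }

    static void demo_launch(bool keep_rs, float * dst, const float * state,
                            int state_size, int K, cudaStream_t stream) {
        const int n_blocks = (state_size + 255) / 256;
        if (keep_rs) {
            demo_kernel<true><<<n_blocks, 256, 0, stream>>>(dst, state, state_size, K);
        } else {
            demo_kernel<false><<<n_blocks, 256, 0, stream>>>(dst, state, state_size, K);
        }
    }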

src/llama-arch.cpp

Lines changed: 1 addition & 1 deletion
@@ -878,7 +878,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
     }
 }

-bool llm_arch_supports_recurrent_partial_rollback(const llm_arch & arch) {
+bool llm_arch_supports_rs_rollback(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_QWEN35:
         case LLM_ARCH_QWEN35MOE:

src/llama-arch.h

Lines changed: 1 addition & 1 deletion
@@ -637,4 +637,4 @@ bool llm_arch_is_recurrent (const llm_arch & arch);
 bool llm_arch_is_hybrid    (const llm_arch & arch);
 bool llm_arch_is_diffusion (const llm_arch & arch);
 bool llm_arch_supports_sm_tensor(const llm_arch & arch);
-bool llm_arch_supports_recurrent_partial_rollback(const llm_arch & arch);
+bool llm_arch_supports_rs_rollback(const llm_arch & arch);

src/llama-context.cpp

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ llama_context::llama_context(
     }

     cparams.n_rs_seq = params.n_rs_seq;
-    if (cparams.n_rs_seq > 0 && !llm_arch_supports_recurrent_partial_rollback(model.arch)) {
+    if (cparams.n_rs_seq > 0 && !llm_arch_supports_rs_rollback(model.arch)) {
         LLAMA_LOG_DEBUG("%s: n_rs_seq=%u requested but model arch does not support recurrent partial rollback; clamping to 0\n",
             __func__, cparams.n_rs_seq);
         cparams.n_rs_seq = 0;

src/llama-memory-recurrent.cpp

Lines changed: 4 additions & 6 deletions
@@ -170,12 +170,10 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     // partial rollback via per-token snapshot index (bounded by n_rs_seq)
     if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
         const llama_pos rollback = cell.pos - (p0 - 1);
-        if (rollback >= 1 && rollback <= (llama_pos) n_rs_seq) {
-            set_rs_idx(seq_id, (uint32_t) rollback);
-            cell.pos = p0 - 1;
-            return true;
-        }
-        return false;
+        GGML_ASSERT(rollback >= 1 && rollback <= (llama_pos) n_rs_seq);
+        set_rs_idx(seq_id, (uint32_t) rollback);
+        cell.pos = p0 - 1;
+        return true;
     }
     // invalidate tails which will be cleared
     if (p0 <= cell.pos && cell.pos < p1) {
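The new GGML_ASSERT turns the old soft bounds check into a hard invariant: with n_rs_seq snapshots kept per sequence, any partial removal that reaches this branch must be answerable from the snapshot ring. A worked example of the arithmetic with illustrative values (standalone C++, not the real llama.cpp API):

    #include <cassert>

    int main() {
        const int cell_pos = 10;  // last position held for this sequence
        const int p0       = 8;   // seq_rm removes positions in [8, p1)
        const int n_rs_seq = 4;   // per-token snapshot slots per sequence

        // rolling back to p0 - 1 = 7 means restoring the snapshot taken
        // 3 tokens before the tail
        const int rollback = cell_pos - (p0 - 1);       // = 3
        assert(rollback >= 1 && rollback <= n_rs_seq);  // mirrors the new GGML_ASSERT
        return 0;
    }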

src/models/delta-net-base.cpp

Lines changed: 3 additions & 3 deletions
@@ -447,7 +447,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
     return build_delta_net_chunking(q, k, v, g, b, s, il);
 }

-bool llm_build_delta_net_base::keep_intermediates() const {
+bool llm_build_delta_net_base::keep_rs() const {
     const int64_t n_seq_tokens = ubatch.n_seq_tokens;
     return cparams.n_rs_seq > 0
         && n_seq_tokens > 1
@@ -466,7 +466,7 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state(
     const uint32_t mem_size = mctx_cur->get_size();
     const int64_t n_seqs = ubatch.n_seqs;
     const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-    const bool keep = keep_intermediates();
+    const bool keep = keep_rs();

     ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
     cb(conv_states, "conv_states", il);
@@ -531,7 +531,7 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
     const int64_t n_seqs = s->ne[3];
     const int64_t n_seq_tokens = q->ne[2];

-    if (!keep_intermediates()) {
+    if (!keep_rs()) {
         auto attn_out = build_delta_net(q, k, v, g, b, s, il);
         ggml_tensor * output = attn_out.first;
         ggml_tensor * new_state = attn_out.second;

src/models/models.h

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ struct llm_build_delta_net_base : public llm_graph_context {
         int il);

     // true when speculative rollback is enabled and the batch fits in the rs cache
-    bool keep_intermediates() const;
+    bool keep_rs() const;

     // read conv state from cache, concat with qkv_mixed, write back (single slot or per-token)
     // qkv_mixed: (qkv_dim, n_seq_tokens, n_seqs); returns conv_input: (kernel_size + n_seq_tokens - 1, channels, n_seqs)
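The shape contract in that last comment follows from causal convolution: the cache contributes the kernel_size - 1 trailing positions of the previous step, and concatenating them in front of the current n_seq_tokens yields kernel_size + n_seq_tokens - 1 input positions, exactly enough to produce one output per token. A quick check with illustrative numbers (not the model's real dimensions):

    #include <cassert>

    int main() {
        const int kernel_size  = 4;   // conv window (illustrative)
        const int n_seq_tokens = 16;  // tokens in this ubatch (illustrative)

        // cached tail (kernel_size - 1 positions) + current tokens
        const int conv_input_len = (kernel_size - 1) + n_seq_tokens;
        assert(conv_input_len == kernel_size + n_seq_tokens - 1);  // = 19

        // a causal conv over that span yields one output per current token
        const int n_outputs = conv_input_len - (kernel_size - 1);
        assert(n_outputs == n_seq_tokens);
        return 0;
    }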

0 commit comments
