@@ -651,7 +651,6 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_csa_lid_attention(
651651 const llama_kv_cache_context * mctx_swa = inp_attn->mctx ->get_swa ();
652652
653653 ggml_build_forward_expand (gf, mctx_swa->cpy_k (ctx0, kv, inp_attn->get_k_idxs_swa (), il));
654- ggml_build_forward_expand (gf, mctx_swa->cpy_v (ctx0, kv, inp_attn->get_v_idxs_swa (), il));
655654
656655 ggml_tensor * raw_k = mctx_swa->get_k (ctx0, il);
657656 if (raw_k->type != GGML_TYPE_F32) {
@@ -709,7 +708,6 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_hca_attention(
709708 const llama_kv_cache_context * mctx_swa = inp_attn->mctx ->get_swa ();
710709
711710 ggml_build_forward_expand (gf, mctx_swa->cpy_k (ctx0, kv, inp_attn->get_k_idxs_swa (), il));
712- ggml_build_forward_expand (gf, mctx_swa->cpy_v (ctx0, kv, inp_attn->get_v_idxs_swa (), il));
713711
714712 ggml_tensor * raw_k = mctx_swa->get_k (ctx0, il);
715713 if (raw_k->type != GGML_TYPE_F32) {
@@ -748,6 +746,42 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_hca_attention(
748746 return out;
749747}
750748
// Builds the attention sub-graph for layer `il`, reading keys back from the
// KV cache context that matches the layer (SWA or base). The tensor returned
// by get_k() is passed to build_attn_mha() as both its second and third
// argument, and only cpy_k() is issued here; there is no cpy_v() call.
//
// Parameters:
//   inp_attn  attention input; supplies the optional rotation tensors, the
//             SWA/base cache contexts, the K indices and the KQ masks.
//   q         query tensor; multiplied by k_rot when a K rotation is present.
//   kv        tensor handed to cpy_k(); multiplied by k_rot when present.
//   sinks     forwarded unchanged to build_attn_mha().
//   kq_scale  forwarded unchanged to build_attn_mha().
//   il        layer index; selects SWA vs base and is passed to the cache
//             accessors and to the cb() callback.
//
// Returns the tensor produced by build_attn_mha(), after it has been passed
// to cb().
749+ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_raw_attention (
750+ llm_graph_input_attn_kv_iswa * inp_attn,
751+ ggml_tensor * q,
752+ ggml_tensor * kv,
753+ ggml_tensor * sinks,
754+ float kq_scale,
755+ int il) const {
// Every SWA/base choice below is driven by this single per-layer flag.
756+ const bool is_swa = hparams.is_swa (il);
757+
758+ ggml_tensor * k_rot = is_swa ? inp_attn->self_k_rot_swa : inp_attn->self_k_rot ;
759+ ggml_tensor * v_rot = is_swa ? inp_attn->self_v_rot_swa : inp_attn->self_v_rot ;
// A V rotation must not be configured on this path.
// NOTE(review): presumably because the cached K tensor is reused as V below,
// so a separate V rotation could not be applied - confirm.
760+ GGML_ASSERT (v_rot == nullptr );
761+
// When a K rotation exists, the same matrix is applied to both q and kv.
762+ if (k_rot) {
763+ q = ggml_mul_mat (ctx0, k_rot, q);
764+ kv = ggml_mul_mat (ctx0, k_rot, kv);
765+ }
766+
// q and kv are added to the graph before the cache copy node is added.
767+ ggml_build_forward_expand (gf, q);
768+ ggml_build_forward_expand (gf, kv);
769+
770+ const llama_kv_cache_context * mctx_cur = is_swa ? inp_attn->mctx ->get_swa () : inp_attn->mctx ->get_base ();
771+ const auto & k_idxs = is_swa ? inp_attn->get_k_idxs_swa () : inp_attn->get_k_idxs ();
772+
// Only the K side of the cache receives kv; no cpy_v() is issued.
773+ ggml_build_forward_expand (gf, mctx_cur->cpy_k (ctx0, kv, k_idxs, il));
774+
775+ const auto & kq_mask = is_swa ? inp_attn->get_kq_mask_swa () : inp_attn->get_kq_mask ();
776+
777+ ggml_tensor * k = mctx_cur->get_k (ctx0, il);
778+
// `k` is deliberately passed twice: once as K and once in the V position.
779+ ggml_tensor * out = build_attn_mha (q, k, k, nullptr , kq_mask, sinks, nullptr , kq_scale, il);
780+ cb (out, " attn_raw" , il);
781+
782+ return out;
783+ }
784+
751785ggml_tensor * llama_model_deepseek_v4_flash::graph::build_attention (
752786 const llama_model & model,
753787 llm_graph_input_dsv4 * inp_dsv4,
@@ -1021,11 +1055,8 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_attention(
10211055 out = build_hca_attention (inp_dsv4, inp_attn, q, kv, layer.attn_sinks ,
10221056 1 .0f /sqrtf (float (n_embd_head)), il);
10231057 } else {
1024- out = build_attn (inp_attn,
1025- nullptr , nullptr , nullptr ,
1026- q, kv, kv, nullptr , layer.attn_sinks , nullptr ,
1058+ out = build_raw_attention (inp_attn, q, kv, layer.attn_sinks ,
10271059 1 .0f /sqrtf (float (n_embd_head)), il);
1028- cb (out, " attn_raw" , il);
10291060 }
10301061
10311062 out = ggml_reshape_3d (ctx0, out, n_embd_head, n_head, nt);
0 commit comments