@@ -557,7 +557,7 @@ ggml_tensor * llama_model_deepseek4::graph::build_lid_top_k(
557557 cb (indexer_q_pe, " lid_q_pe" , il);
558558
559559 indexer_q = ggml_concat (ctx0, indexer_q_nope, indexer_q_pe, 0 );
560- indexer_q = ggml_mul_mat (ctx0, inp_lid.k_rot , indexer_q );
560+ indexer_q = llama_mul_mat_hadamard (ctx0, indexer_q, inp_lid.k_rot );
561561 cb (indexer_q, " lid_q_rot" , il);
562562
563563 ggml_tensor * indexer_weights = build_lora_mm (layer.indexer_proj , cur);
@@ -652,10 +652,15 @@ ggml_tensor * llama_model_deepseek4::graph::build_csa_lid_attention(
652652 int il) const {
653653 const auto & inp_csa = inp_dsv4->get_csa ();
654654 GGML_ASSERT (inp_csa.kq_mask );
655- GGML_ASSERT (inp_attn->self_k_rot == nullptr );
656655
657656 ggml_tensor * top_k = build_lid_top_k (model, inp_dsv4, qr, cur, inp_pos, il);
658657
658+ ggml_tensor * k_rot = inp_attn->self_k_rot ;
659+ if (k_rot) {
660+ q = llama_mul_mat_hadamard (ctx0, q, k_rot);
661+ kv = llama_mul_mat_hadamard (ctx0, kv, k_rot);
662+ }
663+
659664 ggml_build_forward_expand (gf, q);
660665 ggml_build_forward_expand (gf, kv);
661666
@@ -696,6 +701,9 @@ ggml_tensor * llama_model_deepseek4::graph::build_csa_lid_attention(
696701
697702 ggml_tensor * kq_b = dsv4_build_kq_zero_bias (ctx0, cparams, kq_mask, q->ne [1 ]);
698703 ggml_tensor * out = build_attn_mha (q, k_all, k_all, kq_b, kq_mask, sinks, nullptr , kq_scale, il);
704+ if (k_rot) {
705+ out = llama_mul_mat_hadamard (ctx0, out, k_rot);
706+ }
699707 cb (out, " attn_csa_lid" , il);
700708
701709 return out;
@@ -711,7 +719,12 @@ ggml_tensor * llama_model_deepseek4::graph::build_hca_attention(
711719 int il) const {
712720 const auto & inp_hca = inp_dsv4->get_hca ();
713721 GGML_ASSERT (inp_hca.kq_mask );
714- GGML_ASSERT (inp_attn->self_k_rot == nullptr );
722+
723+ ggml_tensor * k_rot = inp_attn->self_k_rot ;
724+ if (k_rot) {
725+ q = llama_mul_mat_hadamard (ctx0, q, k_rot);
726+ kv = llama_mul_mat_hadamard (ctx0, kv, k_rot);
727+ }
715728
716729 ggml_build_forward_expand (gf, q);
717730 ggml_build_forward_expand (gf, kv);
@@ -753,6 +766,9 @@ ggml_tensor * llama_model_deepseek4::graph::build_hca_attention(
753766
754767 ggml_tensor * kq_b = dsv4_build_kq_zero_bias (ctx0, cparams, kq_mask, q->ne [1 ]);
755768 ggml_tensor * out = build_attn_mha (q, k_all, k_all, kq_b, kq_mask, sinks, nullptr , kq_scale, il);
769+ if (k_rot) {
770+ out = llama_mul_mat_hadamard (ctx0, out, k_rot);
771+ }
756772 cb (out, " attn_hca" , il);
757773
758774 return out;
@@ -770,8 +786,8 @@ ggml_tensor * llama_model_deepseek4::graph::build_raw_attention(
770786 ggml_tensor * k_rot = inp_attn->self_k_rot ;
771787
772788 if (k_rot) {
773- q = ggml_mul_mat (ctx0, k_rot, q );
774- kv = ggml_mul_mat (ctx0, k_rot, kv );
789+ q = llama_mul_mat_hadamard (ctx0, q, k_rot );
790+ kv = llama_mul_mat_hadamard (ctx0, kv, k_rot );
775791 }
776792
777793 ggml_build_forward_expand (gf, q);
@@ -788,6 +804,9 @@ ggml_tensor * llama_model_deepseek4::graph::build_raw_attention(
788804
789805 ggml_tensor * kq_b = dsv4_build_kq_zero_bias (ctx0, cparams, kq_mask, q->ne [1 ]);
790806 ggml_tensor * out = build_attn_mha (q, k, k, kq_b, kq_mask, sinks, nullptr , kq_scale, il);
807+ if (k_rot) {
808+ out = llama_mul_mat_hadamard (ctx0, out, k_rot);
809+ }
791810 cb (out, " attn_raw" , il);
792811
793812 return out;
@@ -917,6 +936,11 @@ ggml_tensor * llama_model_deepseek4::graph::build_attention(
917936 " csa_state_compress" ,
918937 il);
919938
939+ if (inp_dsv4->get_csa ().k_rot ) {
940+ kv_comp_csa_state = llama_mul_mat_hadamard (ctx0, kv_comp_csa_state, inp_dsv4->get_csa ().k_rot );
941+ cb (kv_comp_csa_state, " csa_state_compress_rot" , il);
942+ }
943+
920944 ggml_build_forward_expand (gf, inp_dsv4->mctx ->get_csa ()->cpy_k (ctx0,
921945 kv_comp_csa_state, inp_dsv4->get_csa ().state_write_idxs , il));
922946
@@ -965,7 +989,7 @@ ggml_tensor * llama_model_deepseek4::graph::build_attention(
965989 il);
966990
967991 if (inp_dsv4->get_lid ().k_rot ) {
968- kv_comp_lid_state = ggml_mul_mat (ctx0, inp_dsv4->get_lid ().k_rot , kv_comp_lid_state );
992+ kv_comp_lid_state = llama_mul_mat_hadamard (ctx0, kv_comp_lid_state, inp_dsv4->get_lid ().k_rot );
969993 cb (kv_comp_lid_state, " lid_state_compress_rot" , il);
970994 }
971995
@@ -1007,6 +1031,11 @@ ggml_tensor * llama_model_deepseek4::graph::build_attention(
10071031 " hca_state_compress" ,
10081032 il);
10091033
1034+ if (inp_dsv4->get_hca ().k_rot ) {
1035+ kv_comp_hca = llama_mul_mat_hadamard (ctx0, kv_comp_hca, inp_dsv4->get_hca ().k_rot );
1036+ cb (kv_comp_hca, " hca_state_compress_rot" , il);
1037+ }
1038+
10101039 ggml_build_forward_expand (gf, inp_dsv4->mctx ->get_hca ()->cpy_k (ctx0,
10111040 kv_comp_hca, inp_dsv4->get_hca ().state_write_idxs , il));
10121041 hca_state_dep = kv_comp_hca;
@@ -1035,13 +1064,11 @@ ggml_tensor * llama_model_deepseek4::graph::build_attention(
10351064 if (ratio == DSV4_CSA_RATIO &&
10361065 inp_dsv4->get_csa ().kq_mask &&
10371066 inp_dsv4->get_lid ().kq_mask &&
1038- inp_dsv4->get_lid ().k_rot &&
1039- inp_attn->self_k_rot == nullptr ) {
1067+ inp_dsv4->get_lid ().k_rot ) {
10401068 out = build_csa_lid_attention (model, inp_dsv4, inp_attn, q, kv, qr, cur, inp_pos, layer.attn_sinks ,
10411069 1 .0f /sqrtf (float (n_embd_head)), il);
10421070 } else if (ratio == DSV4_HCA_RATIO &&
1043- inp_dsv4->get_hca ().kq_mask &&
1044- inp_attn->self_k_rot == nullptr ) {
1071+ inp_dsv4->get_hca ().kq_mask ) {
10451072 out = build_hca_attention (inp_dsv4, inp_attn, q, kv, layer.attn_sinks ,
10461073 1 .0f /sqrtf (float (n_embd_head)), il);
10471074 } else {
0 commit comments