@@ -2011,6 +2011,7 @@ ggml_tensor * llm_graph_context::build_attn(
20112011 llm_graph_input_attn_no_cache * inp,
20122012 ggml_tensor * wo,
20132013 ggml_tensor * wo_b,
2014+ ggml_tensor * wo_s,
20142015 ggml_tensor * q_cur,
20152016 ggml_tensor * k_cur,
20162017 ggml_tensor * v_cur,
@@ -2044,7 +2045,7 @@ ggml_tensor * llm_graph_context::build_attn(
20442045 cb (cur, " kqv_out" , il);
20452046
20462047 if (wo) {
2047- cur = build_lora_mm (wo, cur);
2048+ cur = build_lora_mm (wo, cur, wo_s );
20482049 }
20492050
20502051 if (wo_b) {
@@ -2095,6 +2096,7 @@ ggml_tensor * llm_graph_context::build_attn(
20952096 llm_graph_input_attn_kv * inp,
20962097 ggml_tensor * wo,
20972098 ggml_tensor * wo_b,
2099+ ggml_tensor * wo_s,
20982100 ggml_tensor * q_cur,
20992101 ggml_tensor * k_cur,
21002102 ggml_tensor * v_cur,
@@ -2146,10 +2148,15 @@ ggml_tensor * llm_graph_context::build_attn(
21462148 }
21472149
21482150 if (wo) {
2149- cur = build_lora_mm (wo, cur);
21502151 if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2 ) {
21512152 // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
2153+ cur = build_lora_mm (wo, cur);
21522154 ggml_mul_mat_set_prec (cur, GGML_PREC_F32 );
2155+ if (wo_s) {
2156+ cur = ggml_mul (ctx0, cur, wo_s);
2157+ }
2158+ } else {
2159+ cur = build_lora_mm (wo, cur, wo_s);
21532160 }
21542161 }
21552162
@@ -2193,6 +2200,7 @@ ggml_tensor * llm_graph_context::build_attn(
21932200 llm_graph_input_attn_k * inp,
21942201 ggml_tensor * wo,
21952202 ggml_tensor * wo_b,
2203+ ggml_tensor * wo_s,
21962204 ggml_tensor * q_cur,
21972205 ggml_tensor * k_cur,
21982206 ggml_tensor * v_cur,
@@ -2227,10 +2235,15 @@ ggml_tensor * llm_graph_context::build_attn(
22272235 cb (cur, " kqv_out" , il);
22282236
22292237 if (wo) {
2230- cur = build_lora_mm (wo, cur);
22312238 if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE ) {
22322239 // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
2240+ cur = build_lora_mm (wo, cur);
22332241 ggml_mul_mat_set_prec (cur, GGML_PREC_F32 );
2242+ if (wo_s) {
2243+ cur = ggml_mul (ctx0, cur, wo_s);
2244+ }
2245+ } else {
2246+ cur = build_lora_mm (wo, cur, wo_s);
22342247 }
22352248 }
22362249
@@ -2245,6 +2258,7 @@ ggml_tensor * llm_graph_context::build_attn(
22452258 llm_graph_input_attn_kv_iswa * inp,
22462259 ggml_tensor * wo,
22472260 ggml_tensor * wo_b,
2261+ ggml_tensor * wo_s,
22482262 ggml_tensor * q_cur,
22492263 ggml_tensor * k_cur,
22502264 ggml_tensor * v_cur,
@@ -2313,7 +2327,7 @@ ggml_tensor * llm_graph_context::build_attn(
23132327 }
23142328
23152329 if (wo) {
2316- cur = build_lora_mm (wo, cur);
2330+ cur = build_lora_mm (wo, cur, wo_s );
23172331 }
23182332
23192333 if (wo_b) {
@@ -2344,6 +2358,7 @@ ggml_tensor * llm_graph_context::build_attn(
23442358 llm_graph_input_attn_cross * inp,
23452359 ggml_tensor * wo,
23462360 ggml_tensor * wo_b,
2361+ ggml_tensor * wo_s,
23472362 ggml_tensor * q_cur,
23482363 ggml_tensor * k_cur,
23492364 ggml_tensor * v_cur,
@@ -2368,7 +2383,7 @@ ggml_tensor * llm_graph_context::build_attn(
23682383 cb (cur, " kqv_out" , il);
23692384
23702385 if (wo) {
2371- cur = build_lora_mm (wo, cur);
2386+ cur = build_lora_mm (wo, cur, wo_s );
23722387 }
23732388
23742389 if (wo_b) {
0 commit comments