@@ -1088,6 +1088,10 @@ ggml_tensor * llm_graph_context::build_lora_mm(
10881088 ggml_tensor * w_s) const {
10891089 ggml_tensor * res = ggml_mul_mat (ctx0, w, cur);
10901090
1091+ if (w_s) {
1092+ res = ggml_mul (ctx0, res, w_s);
1093+ }
1094+
10911095 for (const auto & lora : *loras) {
10921096 llama_adapter_lora_weight * lw = lora.first ->get_weight (w);
10931097 if (lw == nullptr ) {
@@ -1106,18 +1110,24 @@ ggml_tensor * llm_graph_context::build_lora_mm(
11061110 res = ggml_add (ctx0, res, ab_cur);
11071111 }
11081112
1109- if (w_s) {
1110- res = ggml_mul (ctx0, res, w_s);
1111- }
1112-
11131113 return res;
11141114}
11151115
11161116ggml_tensor * llm_graph_context::build_lora_mm_id (
11171117 ggml_tensor * w, // ggml_tensor * as
11181118 ggml_tensor * cur, // ggml_tensor * b
1119- ggml_tensor * ids) const {
1119+ ggml_tensor * ids,
1120+ ggml_tensor * w_s) const {
11201121 ggml_tensor * res = ggml_mul_mat_id (ctx0, w, cur, ids);
1122+
1123+ if (w_s) {
1124+ const int64_t n_expert = w_s->ne [0 ];
1125+ const int64_t n_tokens = cur->ne [2 ];
1126+ ggml_tensor * s = ggml_reshape_3d (ctx0, w_s, 1 , n_expert, 1 );
1127+ s = ggml_repeat_4d (ctx0, s, 1 , n_expert, n_tokens, 1 );
1128+ s = ggml_get_rows (ctx0, s, ids);
1129+ res = ggml_mul (ctx0, res, s);
1130+ }
11211131 for (const auto & lora : *loras) {
11221132 llama_adapter_lora_weight * lw = lora.first ->get_weight (w);
11231133 if (lw == nullptr ) {
@@ -1269,6 +1279,29 @@ ggml_tensor * llm_graph_context::build_ffn(
12691279 llm_ffn_op_type type_op,
12701280 llm_ffn_gate_type type_gate,
12711281 int il) const {
1282+ // NVFP4 support is currently restricted to
1283+ // 1) LORA absence (*_s would be applied after LORA residual, which is incorrect)
1284+ // 2) bias absence (*_s would be applied after bias addition, which is incorrect)
1285+ // TODO: disambiguate LLM-architectural scales (which use *_s) from NVFP4 scale_2 (which also uses *_s currently)
1286+ auto has_lora = [this ](ggml_tensor * w) {
1287+ if (!w) {
1288+ return false ;
1289+ }
1290+ for (const auto & lora : *loras) {
1291+ if (lora.first ->get_weight (w) != nullptr ) {
1292+ return true ;
1293+ }
1294+ }
1295+ return false ;
1296+ };
1297+
1298+ GGML_ASSERT (!up_s || !up_b || !up || up->type != GGML_TYPE_NVFP4 );
1299+ GGML_ASSERT (!gate_s || !gate_b || !gate || gate->type != GGML_TYPE_NVFP4 );
1300+ GGML_ASSERT (!down_s || !down_b || !down || down->type != GGML_TYPE_NVFP4 );
1301+ GGML_ASSERT (!up_s || !up || up->type != GGML_TYPE_NVFP4 || !has_lora (up));
1302+ GGML_ASSERT (!gate_s || !gate || gate->type != GGML_TYPE_NVFP4 || !has_lora (gate));
1303+ GGML_ASSERT (!down_s || !down || down->type != GGML_TYPE_NVFP4 || !has_lora (down));
1304+
12721305 ggml_tensor * tmp = up ? build_lora_mm (up, cur) : cur;
12731306 cb (tmp, " ffn_up" , il);
12741307
@@ -1627,67 +1660,52 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
16271660
16281661 if (gate_up_exps) {
16291662 // merged gate_up path: one mul_mat_id, then split into gate and up views
1630- ggml_tensor * gate_up = build_lora_mm_id (gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens]
1663+ ggml_tensor * gate_up = build_lora_mm_id (gate_up_exps, cur, selected_experts, up_exps_s ); // [n_ff*2, n_expert_used, n_tokens]
16311664 cb (gate_up, " ffn_moe_gate_up" , il);
16321665
1666+ if (up_exps_s) {
1667+ cb (gate_up, " ffn_moe_gate_up_scaled" , il);
1668+ }
1669+
16331670 if (gate_up_exps_b) {
16341671 gate_up = ggml_add_id (ctx0, gate_up, gate_up_exps_b, selected_experts);
16351672 cb (gate_up, " ffn_moe_gate_up_biased" , il);
16361673 }
16371674
1638- // apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused)
1639- if (up_exps_s) {
1640- ggml_tensor * s = ggml_reshape_3d (ctx0, up_exps_s, 1 , n_expert, 1 );
1641- s = ggml_repeat_4d (ctx0, s, 1 , n_expert, n_tokens, 1 );
1642- s = ggml_get_rows (ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
1643- gate_up = ggml_mul (ctx0, gate_up, s);
1644- cb (gate_up, " ffn_moe_gate_up_scaled" , il);
1645- }
1646-
16471675 const int64_t n_ff = gate_up->ne [0 ] / 2 ;
16481676 cur = ggml_view_3d (ctx0, gate_up, n_ff, gate_up->ne [1 ], gate_up->ne [2 ], gate_up->nb [1 ], gate_up->nb [2 ], 0 );
16491677 cb (cur, " ffn_moe_gate" , il);
16501678 up = ggml_view_3d (ctx0, gate_up, n_ff, gate_up->ne [1 ], gate_up->ne [2 ], gate_up->nb [1 ], gate_up->nb [2 ], n_ff * gate_up->nb [0 ]);
16511679 cb (up, " ffn_moe_up" , il);
16521680 } else {
16531681 // separate gate and up path
1654- up = build_lora_mm_id (up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
1682+ up = build_lora_mm_id (up_exps, cur, selected_experts, up_exps_s ); // [n_ff, n_expert_used, n_tokens]
16551683 cb (up, " ffn_moe_up" , il);
16561684
1685+ if (up_exps_s) {
1686+ cb (up, " ffn_moe_up_scaled" , il);
1687+ }
1688+
16571689 if (up_exps_b) {
16581690 up = ggml_add_id (ctx0, up, up_exps_b, selected_experts);
16591691 cb (up, " ffn_moe_up_biased" , il);
16601692 }
16611693
1662- // apply per-expert scale2 to up
1663- if (up_exps_s) {
1664- ggml_tensor * s = ggml_reshape_3d (ctx0, up_exps_s, 1 , n_expert, 1 );
1665- s = ggml_repeat_4d (ctx0, s, 1 , n_expert, n_tokens, 1 );
1666- s = ggml_get_rows (ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
1667- up = ggml_mul (ctx0, up, s);
1668- cb (up, " ffn_moe_up_scaled" , il);
1669- }
1670-
16711694 if (gate_exps) {
1672- cur = build_lora_mm_id (gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
1695+ cur = build_lora_mm_id (gate_exps, cur, selected_experts, gate_exps_s ); // [n_ff, n_expert_used, n_tokens]
16731696 cb (cur, " ffn_moe_gate" , il);
16741697 } else {
16751698 cur = up;
16761699 }
16771700
1701+ if (gate_exps_s) {
1702+ cb (cur, " ffn_moe_gate_scaled" , il);
1703+ }
1704+
16781705 if (gate_exps_b) {
16791706 cur = ggml_add_id (ctx0, cur, gate_exps_b, selected_experts);
16801707 cb (cur, " ffn_moe_gate_biased" , il);
16811708 }
1682-
1683- // apply per-expert scale2 to gate
1684- if (gate_exps_s) {
1685- ggml_tensor * s = ggml_reshape_3d (ctx0, gate_exps_s, 1 , n_expert, 1 );
1686- s = ggml_repeat_4d (ctx0, s, 1 , n_expert, n_tokens, 1 );
1687- s = ggml_get_rows (ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
1688- cur = ggml_mul (ctx0, cur, s);
1689- cb (cur, " ffn_moe_gate_scaled" , il);
1690- }
16911709 }
16921710
16931711 const bool has_gate = gate_exps || gate_up_exps;
@@ -1759,23 +1777,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
17591777 GGML_ABORT (" fatal error" );
17601778 }
17611779
1762- experts = build_lora_mm_id (down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
1780+ experts = build_lora_mm_id (down_exps, cur, selected_experts, down_exps_s ); // [n_embd, n_expert_used, n_tokens]
17631781 cb (experts, " ffn_moe_down" , il);
17641782
1783+ if (down_exps_s) {
1784+ cb (experts, " ffn_moe_down_scaled" , il);
1785+ }
1786+
17651787 if (down_exps_b) {
17661788 experts = ggml_add_id (ctx0, experts, down_exps_b, selected_experts);
17671789 cb (experts, " ffn_moe_down_biased" , il);
17681790 }
17691791
1770- // apply per-expert scale2 to down
1771- if (down_exps_s) {
1772- ggml_tensor * s = ggml_reshape_3d (ctx0, down_exps_s, 1 , n_expert, 1 );
1773- s = ggml_repeat_4d (ctx0, s, 1 , n_expert, n_tokens, 1 );
1774- s = ggml_get_rows (ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
1775- experts = ggml_mul (ctx0, experts, s);
1776- cb (experts, " ffn_moe_down_scaled" , il);
1777- }
1778-
17791792 if (!weight_before_ffn) {
17801793 experts = ggml_mul (ctx0, experts, weights);
17811794 cb (experts, " ffn_moe_weighted" , il);
0 commit comments