Skip to content

Commit 33d3fc5

Browse files
committed
Wire NVFP4 weight_scales and input_scales everywhere
This was done mechanically with the help of AI
1 parent 26de66e commit 33d3fc5

108 files changed

Lines changed: 920 additions & 318 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

src/models/afmoe.cpp

Lines changed: 20 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -151,7 +151,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
151151
n_embd_head, n_head, n_head_kv, il);
152152

153153
// compute gate from input
154-
ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, attn_inp);
154+
ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, attn_inp, model.layers[il].wqkv_gate_s, model.layers[il].wqkv_gate_in_s);
155155
cb(gate, "attn_gate_proj", il);
156156

157157
// Q/K normalization
@@ -186,7 +186,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
186186
cb(cur, "attn_gated", il);
187187

188188
// now apply output projection
189-
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
189+
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s, model.layers[il].wo_in_s);
190190
cb(cur, "attn_o_proj", il);
191191
}
192192

@@ -224,7 +224,15 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
224224
hparams.expert_weights_norm, // norm_w (route_norm=True)
225225
hparams.expert_weights_scale, // w_scale (route_scale=2.826)
226226
(llama_expert_gating_func_type) hparams.expert_gating_func,
227-
il);
227+
il,
228+
nullptr,
229+
nullptr,
230+
model.layers[il].ffn_up_exps_s,
231+
model.layers[il].ffn_gate_exps_s,
232+
model.layers[il].ffn_down_exps_s,
233+
model.layers[il].ffn_up_exps_in_s,
234+
model.layers[il].ffn_gate_exps_in_s,
235+
model.layers[il].ffn_down_exps_in_s);
228236
cb(moe_out, "ffn_moe_out", il);
229237

230238
// shared expert
@@ -234,7 +242,10 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
234242
model.layers[il].ffn_gate_shexp, NULL, NULL,
235243
model.layers[il].ffn_down_shexp, NULL, NULL,
236244
NULL,
237-
LLM_FFN_SILU, LLM_FFN_PAR, il);
245+
LLM_FFN_SILU, LLM_FFN_PAR, il,
246+
model.layers[il].ffn_up_shexp_in_s,
247+
model.layers[il].ffn_gate_shexp_in_s,
248+
model.layers[il].ffn_down_shexp_in_s);
238249
cb(ffn_shexp, "ffn_shexp", il);
239250

240251
cur = ggml_add(ctx0, moe_out, ffn_shexp);
@@ -249,7 +260,10 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
249260
model.layers[il].ffn_gate, NULL, NULL,
250261
model.layers[il].ffn_down, NULL, NULL,
251262
NULL,
252-
LLM_FFN_SILU, LLM_FFN_PAR, il);
263+
LLM_FFN_SILU, LLM_FFN_PAR, il,
264+
model.layers[il].ffn_up_in_s,
265+
model.layers[il].ffn_gate_in_s,
266+
model.layers[il].ffn_down_in_s);
253267
cb(cur, "ffn_out", il);
254268
}
255269

@@ -277,7 +291,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
277291
res->t_embd = cur;
278292

279293
// lm_head
280-
cur = build_lora_mm(model.output, cur, model.output_s);
294+
cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
281295
cb(cur, "result_output", -1);
282296
res->t_logits = cur;
283297

src/models/apertus.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -125,7 +125,7 @@ llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_par
125125
cb(cur, "ffn_norm", il);
126126

127127
// Up projection
128-
ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur);
128+
ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur, model.layers[il].ffn_up_s, model.layers[il].ffn_up_in_s);
129129
cb(up, "ffn_up", il);
130130

131131
float alpha_n_val = hparams.xielu_alpha_n[il];
@@ -138,7 +138,7 @@ llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_par
138138
cb(activated, "ffn_xielu", il);
139139

140140
// Down projection
141-
cur = build_lora_mm(model.layers[il].ffn_down, activated);
141+
cur = build_lora_mm(model.layers[il].ffn_down, activated, model.layers[il].ffn_down_s, model.layers[il].ffn_down_in_s);
142142
cb(cur, "ffn_down", il);
143143
}
144144

@@ -160,7 +160,7 @@ llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_par
160160
res->t_embd = cur;
161161

162162
// lm_head
163-
cur = build_lora_mm(model.output, cur, model.output_s);
163+
cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
164164

165165
cb(cur, "result_output", -1);
166166
res->t_logits = cur;

src/models/arcee.cpp

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -125,7 +125,10 @@ llama_model_arcee::graph::graph(const llama_model & model, const llm_graph_param
125125
NULL, NULL, NULL,
126126
model.layers[il].ffn_down, NULL, NULL,
127127
NULL,
128-
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
128+
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il,
129+
model.layers[il].ffn_up_in_s,
130+
nullptr,
131+
model.layers[il].ffn_down_in_s);
129132
cb(cur, "ffn_out", il);
130133

131134
cur = ggml_add(ctx0, cur, ffn_inp);
@@ -148,7 +151,7 @@ llama_model_arcee::graph::graph(const llama_model & model, const llm_graph_param
148151
res->t_embd = cur;
149152

150153
// lm_head
151-
cur = build_lora_mm(model.output, cur, model.output_s);
154+
cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
152155

153156
cb(cur, "result_output", -1);
154157
res->t_logits = cur;

src/models/arctic.cpp

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -126,7 +126,10 @@ llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_para
126126
model.layers[il].ffn_gate, NULL, NULL,
127127
model.layers[il].ffn_down, NULL, NULL,
128128
NULL,
129-
LLM_FFN_SILU, LLM_FFN_PAR, il);
129+
LLM_FFN_SILU, LLM_FFN_PAR, il,
130+
model.layers[il].ffn_up_in_s,
131+
model.layers[il].ffn_gate_in_s,
132+
model.layers[il].ffn_down_in_s);
130133
cb(cur, "ffn_out", il);
131134

132135
ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
@@ -148,7 +151,15 @@ llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_para
148151
LLM_FFN_SILU, true,
149152
hparams.expert_weights_scale,
150153
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
151-
il);
154+
il,
155+
nullptr,
156+
nullptr,
157+
model.layers[il].ffn_up_exps_s,
158+
model.layers[il].ffn_gate_exps_s,
159+
model.layers[il].ffn_down_exps_s,
160+
model.layers[il].ffn_up_exps_in_s,
161+
model.layers[il].ffn_gate_exps_in_s,
162+
model.layers[il].ffn_down_exps_in_s);
152163
cb(cur, "ffn_moe_out", il);
153164

154165
cur = ggml_add(ctx0, cur, ffn_out);
@@ -171,7 +182,7 @@ llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_para
171182
res->t_embd = cur;
172183

173184
// lm_head
174-
cur = build_lora_mm(model.output, cur, model.output_s);
185+
cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
175186

176187
cb(cur, "result_output", -1);
177188
res->t_logits = cur;

src/models/arwkv7.cpp

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -176,7 +176,10 @@ llama_model_arwkv7::graph::graph(const llama_model & model, const llm_graph_para
176176
model.layers[il].ffn_gate, NULL, NULL,
177177
model.layers[il].ffn_down, NULL, NULL,
178178
NULL,
179-
LLM_FFN_SILU, LLM_FFN_PAR, il);
179+
LLM_FFN_SILU, LLM_FFN_PAR, il,
180+
model.layers[il].ffn_up_in_s,
181+
model.layers[il].ffn_gate_in_s,
182+
model.layers[il].ffn_down_in_s);
180183
cb(cur, "ffn_out", il);
181184

182185
cur = ggml_add(ctx0, cur, ffn_inp);
@@ -193,7 +196,7 @@ llama_model_arwkv7::graph::graph(const llama_model & model, const llm_graph_para
193196
cb(cur, "result_norm", -1);
194197
res->t_embd = cur;
195198

196-
cur = build_lora_mm(model.output, cur, model.output_s);
199+
cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
197200

198201
cb(cur, "result_output", -1);
199202
res->t_logits = cur;

src/models/baichuan.cpp

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -123,7 +123,10 @@ llama_model_baichuan::graph::graph(const llama_model & model, const llm_graph_pa
123123
model.layers[il].ffn_gate, NULL, NULL,
124124
model.layers[il].ffn_down, NULL, NULL,
125125
NULL,
126-
LLM_FFN_SILU, LLM_FFN_PAR, il);
126+
LLM_FFN_SILU, LLM_FFN_PAR, il,
127+
model.layers[il].ffn_up_in_s,
128+
model.layers[il].ffn_gate_in_s,
129+
model.layers[il].ffn_down_in_s);
127130
cb(cur, "ffn_out", il);
128131
}
129132

@@ -146,7 +149,7 @@ llama_model_baichuan::graph::graph(const llama_model & model, const llm_graph_pa
146149
res->t_embd = cur;
147150

148151
// lm_head
149-
cur = build_lora_mm(model.output, cur, model.output_s);
152+
cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
150153

151154
cb(cur, "result_output", -1);
152155
res->t_logits = cur;

src/models/bailingmoe.cpp

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -135,7 +135,15 @@ llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_
135135
LLM_FFN_SILU, hparams.expert_weights_norm,
136136
hparams.expert_weights_scale,
137137
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
138-
il);
138+
il,
139+
nullptr,
140+
nullptr,
141+
model.layers[il].ffn_up_exps_s,
142+
model.layers[il].ffn_gate_exps_s,
143+
model.layers[il].ffn_down_exps_s,
144+
model.layers[il].ffn_up_exps_in_s,
145+
model.layers[il].ffn_gate_exps_in_s,
146+
model.layers[il].ffn_down_exps_in_s);
139147
cb(moe_out, "ffn_moe_out", il);
140148

141149
// FFN shared expert
@@ -145,7 +153,10 @@ llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_
145153
model.layers[il].ffn_gate_shexp, NULL, NULL,
146154
model.layers[il].ffn_down_shexp, NULL, NULL,
147155
NULL,
148-
LLM_FFN_SILU, LLM_FFN_PAR, il);
156+
LLM_FFN_SILU, LLM_FFN_PAR, il,
157+
model.layers[il].ffn_up_shexp_in_s,
158+
model.layers[il].ffn_gate_shexp_in_s,
159+
model.layers[il].ffn_down_shexp_in_s);
149160
cb(ffn_shexp, "ffn_shexp", il);
150161

151162
cur = ggml_add(ctx0, moe_out, ffn_shexp);
@@ -171,7 +182,7 @@ llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_
171182
res->t_embd = cur;
172183

173184
// lm_head
174-
cur = build_lora_mm(model.output, cur, model.output_s);
185+
cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
175186

176187
cb(cur, "result_output", -1);
177188
res->t_logits = cur;

src/models/bailingmoe2.cpp

Lines changed: 18 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -163,7 +163,10 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
163163
model.layers[il].ffn_up, NULL, NULL,
164164
model.layers[il].ffn_gate, NULL, NULL,
165165
model.layers[il].ffn_down, NULL, NULL,
166-
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
166+
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il,
167+
model.layers[il].ffn_up_in_s,
168+
model.layers[il].ffn_gate_in_s,
169+
model.layers[il].ffn_down_in_s);
167170
cb(cur, "ffn_out", il);
168171
} else {
169172
ggml_tensor * moe_out = build_moe_ffn(cur,
@@ -176,7 +179,15 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
176179
LLM_FFN_SILU, hparams.expert_weights_norm,
177180
hparams.expert_weights_scale,
178181
(llama_expert_gating_func_type) hparams.expert_gating_func,
179-
il);
182+
il,
183+
nullptr,
184+
nullptr,
185+
model.layers[il].ffn_up_exps_s,
186+
model.layers[il].ffn_gate_exps_s,
187+
model.layers[il].ffn_down_exps_s,
188+
model.layers[il].ffn_up_exps_in_s,
189+
model.layers[il].ffn_gate_exps_in_s,
190+
model.layers[il].ffn_down_exps_in_s);
180191
cb(moe_out, "ffn_moe_out", il);
181192

182193
{
@@ -185,7 +196,10 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
185196
model.layers[il].ffn_up_shexp, NULL, NULL,
186197
model.layers[il].ffn_gate_shexp, NULL, NULL,
187198
model.layers[il].ffn_down_shexp, NULL, NULL,
188-
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
199+
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il,
200+
model.layers[il].ffn_up_shexp_in_s,
201+
model.layers[il].ffn_gate_shexp_in_s,
202+
model.layers[il].ffn_down_shexp_in_s);
189203
cb(ffn_shexp, "ffn_shexp", il);
190204

191205
cur = ggml_add(ctx0, moe_out, ffn_shexp);
@@ -210,7 +224,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
210224
res->t_embd = cur;
211225

212226
// lm_head
213-
cur = build_lora_mm(model.output, cur, model.output_s);
227+
cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
214228

215229
cb(cur, "result_output", -1);
216230
res->t_logits = cur;

src/models/bert.cpp

Lines changed: 20 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -186,15 +186,25 @@ llama_model_bert::graph::graph(const llama_model & model, const llm_graph_params
186186
LLM_FFN_GELU, false,
187187
hparams.expert_weights_scale,
188188
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
189-
il);
189+
il,
190+
nullptr, nullptr,
191+
model.layers[il].ffn_up_exps_s,
192+
nullptr,
193+
model.layers[il].ffn_down_exps_s,
194+
model.layers[il].ffn_up_exps_in_s,
195+
nullptr,
196+
model.layers[il].ffn_down_exps_in_s);
190197
cb(cur, "ffn_moe_out", il);
191198
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
192199
model.arch == LLM_ARCH_JINA_BERT_V3) {
193200
cur = build_ffn(cur,
194201
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
195202
NULL, NULL, NULL,
196203
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
197-
LLM_FFN_GELU, LLM_FFN_SEQ, il);
204+
LLM_FFN_GELU, LLM_FFN_SEQ, il,
205+
model.layers[il].ffn_up_in_s,
206+
nullptr,
207+
model.layers[il].ffn_down_in_s);
198208
cb(cur, "ffn_out", il);
199209
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
200210
const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
@@ -203,14 +213,20 @@ llama_model_bert::graph::graph(const llama_model & model, const llm_graph_params
203213
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
204214
model.layers[il].ffn_gate, NULL, NULL,
205215
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
206-
type_op, LLM_FFN_PAR, il);
216+
type_op, LLM_FFN_PAR, il,
217+
model.layers[il].ffn_up_in_s,
218+
model.layers[il].ffn_gate_in_s,
219+
model.layers[il].ffn_down_in_s);
207220
cb(cur, "ffn_out", il);
208221
} else {
209222
cur = build_ffn(cur,
210223
model.layers[il].ffn_up, NULL, NULL,
211224
model.layers[il].ffn_gate, NULL, NULL,
212225
model.layers[il].ffn_down, NULL, NULL,
213-
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
226+
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il,
227+
model.layers[il].ffn_up_in_s,
228+
model.layers[il].ffn_gate_in_s,
229+
model.layers[il].ffn_down_in_s);
214230
cb(cur, "ffn_out", il);
215231
}
216232

src/models/bitnet.cpp

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -103,7 +103,7 @@ llama_model_bitnet::graph::graph(const llama_model & model, const llm_graph_para
103103
LLM_NORM_RMS, il);
104104
cb(cur, "attn_sub_norm", il);
105105

106-
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
106+
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s, model.layers[il].wo_in_s);
107107
if (model.layers[il].wo_b) {
108108
cur = ggml_add(ctx0, cur, model.layers[il].wo_b);
109109
}
@@ -129,15 +129,18 @@ llama_model_bitnet::graph::graph(const llama_model & model, const llm_graph_para
129129
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
130130
NULL, NULL, NULL,
131131
NULL,
132-
LLM_FFN_SILU, LLM_FFN_PAR, il);
132+
LLM_FFN_SILU, LLM_FFN_PAR, il,
133+
model.layers[il].ffn_up_in_s,
134+
model.layers[il].ffn_gate_in_s,
135+
nullptr);
133136
cb(cur, "ffn_sub_out", il);
134137

135138
cur = build_norm(cur,
136139
model.layers[il].ffn_sub_norm, NULL,
137140
LLM_NORM_RMS, il);
138141
cb(cur, "ffn_sub_norm", il);
139142

140-
cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s);
143+
cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s, model.layers[il].ffn_down_in_s);
141144
cb(cur, "ffn_down", il);
142145

143146
cur = ggml_add(ctx0, cur, ffn_inp);

0 commit comments

Comments
 (0)