Skip to content

Commit 10e5b14

Browse files
authored
llama-quant : correct n_attention_wv usage (ggml-org#20357)
* llama-quant : correct `n_attention_wv` usage

  In ggml-org#19770, I introduced a regression in the way the `quantize_state_impl` counter values were initialized. I was incrementing and using `n_attention_wv` in the same loop, when its value should already have been final by the time we decide tensor types in `llama_tensor_get_type_impl` (for `use_more_bits`).

  I never observed a difference in any of my tests (see the comment on ggml-org#19770) - it was only after @bartowski kindly pointed this out that I realized it was incorrect. (Thanks!)

* simplify
1 parent 90b2731 commit 10e5b14

1 file changed

Lines changed: 16 additions & 13 deletions

File tree

src/llama-quant.cpp

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -870,9 +870,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
870870

871871
quantize_state_impl qs(model, params);
872872

873-
// these need to be set to n_layer by default
874-
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
875-
876873
if (params->only_copy) {
877874
ftype = ml.ftype;
878875
}
@@ -979,6 +976,22 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
979976
// compute tensor metadata once and cache it
980977
std::vector<tensor_metadata> metadata(tensors.size());
981978

979+
// initialize quantization state before preliminary loop (counters for use_more_bits)
980+
{
981+
for (size_t i = 0; i < tensors.size(); ++i) {
982+
const auto cat = tensor_get_category(tensors[i]->tensor->name);
983+
if (category_is_attn_v(cat)) {
984+
++qs.n_attention_wv;
985+
}
986+
if (cat == tensor_category::OUTPUT) {
987+
qs.has_tied_embeddings = false;
988+
}
989+
metadata[i].category = cat; // save and re-use the category while we're at it
990+
}
991+
// these also need to be set to n_layer by default
992+
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
993+
}
994+
982995
// flag for --dry-run
983996
bool will_require_imatrix = false;
984997

@@ -991,16 +1004,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
9911004
const struct ggml_tensor * tensor = it->tensor;
9921005
const std::string name = ggml_get_name(tensor);
9931006

994-
metadata[i].category = tensor_get_category(name);
995-
996-
if (category_is_attn_v(metadata[i].category)) {
997-
++qs.n_attention_wv;
998-
}
999-
1000-
if (tensor_name_match_output_weight(name.c_str())) {
1001-
qs.has_tied_embeddings = false;
1002-
}
1003-
10041007
uint16_t i_split = params->keep_split ? it->idx : 0;
10051008
if (!ctx_outs[i_split]) {
10061009
ctx_outs[i_split].reset(gguf_init_empty());

0 commit comments

Comments
 (0)